From 1715b6359c1ae37f24d6774f0bcd73b6bf839eaa Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 6 Oct 2023 16:14:54 -0300 Subject: [PATCH 0001/1290] perf beauty socket/prctl_option: Cope with extended regexp complaint by grep Noticed on fedora 38, the extended regexp that so far was ok for both grep and sed now gets complaints by grep, that says '/' doesn't need to be escaped with '\'. So stop using '/' in sed, use '%' instead and remove the \ before / in the common extended regexp. Link: https://x.com/SMT_Solvers/status/1710380010098344192?s=20 Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZUEddFPTJHVLhH%2F6@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/prctl_option.sh | 4 ++-- tools/perf/trace/beauty/socket.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/trace/beauty/prctl_option.sh b/tools/perf/trace/beauty/prctl_option.sh index 8059342ca4126..9455d9672f140 100755 --- a/tools/perf/trace/beauty/prctl_option.sh +++ b/tools/perf/trace/beauty/prctl_option.sh @@ -4,9 +4,9 @@ [ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/ printf "static const char *prctl_options[] = {\n" -regex='^#define[[:space:]]{1}PR_(\w+)[[:space:]]*([[:xdigit:]]+)([[:space:]]*\/.*)?$' +regex='^#define[[:space:]]{1}PR_(\w+)[[:space:]]*([[:xdigit:]]+)([[:space:]]*/.*)?$' grep -E $regex ${header_dir}/prctl.h | grep -v PR_SET_PTRACER | \ - sed -r "s/$regex/\2 \1/g" | \ + sed -E "s%$regex%\2 \1%g" | \ sort -n | xargs printf "\t[%s] = \"%s\",\n" printf "};\n" diff --git a/tools/perf/trace/beauty/socket.sh b/tools/perf/trace/beauty/socket.sh index 8bc7ba62203e4..670c6db298ae0 100755 --- a/tools/perf/trace/beauty/socket.sh +++ b/tools/perf/trace/beauty/socket.sh @@ -18,10 +18,10 @@ grep -E $ipproto_regex ${uapi_header_dir}/in.h | \ printf "};\n\n" printf "static const char *socket_level[] = {\n" -socket_level_regex='^#define[[:space:]]+SOL_(\w+)[[:space:]]+([[:digit:]]+)([[:space:]]+\/.*)?' +socket_level_regex='^#define[[:space:]]+SOL_(\w+)[[:space:]]+([[:digit:]]+)([[:space:]]+/.*)?' grep -E $socket_level_regex ${beauty_header_dir}/socket.h | \ - sed -r "s/$socket_level_regex/\2 \1/g" | \ + sed -E "s%$socket_level_regex%\2 \1%g" | \ sort -n | xargs printf "\t[%s] = \"%s\",\n" printf "};\n\n" -- GitLab From c8e3ade38bc6545faece71cc6c642ad744d4cea3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 6 Oct 2023 18:11:05 -0300 Subject: [PATCH 0002/1290] perf tests make: Remove the last egrep call, use 'grep -E' instead One last case, caught while testing with amazonlinux:2, centos:stream, etc: 4 7.28 amazonlinux:2 : FAIL egrep: warning: egrep is obsolescent; using grep -E gcc version 7.3.1 20180712 (Red Hat 7.3.1-17) (GCC) 8 13.87 centos:stream : FAIL egrep: warning: egrep is obsolescent; using grep -E Reviewed-by: Guilherme Amadio Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZUEdtblE8qDAQkBK@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/make b/tools/perf/tests/make index d9945ed25bc5a..8a4da7eb637a8 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -183,7 +183,7 @@ run += make_install_prefix_slash # run += make_install_pdf run += make_minimal -old_libbpf := $(shell echo '\#include ' | $(CC) -E -dM -x c -| egrep -q "define[[:space:]]+LIBBPF_MAJOR_VERSION[[:space:]]+0{1}") +old_libbpf := $(shell echo '\#include ' | $(CC) -E -dM -x c -| grep -q -E "define[[:space:]]+LIBBPF_MAJOR_VERSION[[:space:]]+0{1}") ifneq ($(old_libbpf),) run += make_libbpf_dynamic -- GitLab From 851bbccf6b0c152d98ecf0ec83d75fc97aebf43c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 11 Oct 2023 18:14:52 -0300 Subject: [PATCH 0003/1290] perf build: Warn about missing libelf before warning about missing libbpf As libelf is a requirement for libbpf if it is not available, as in some container build tests where NO_LIBELF=1 is used, then better warn about the most basic library first. Ditto for libz, check its availability before libbpf too. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZUEehyDk0FkPnvMR@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index b3e6ed10f40c6..8b6cffbc48583 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -680,15 +680,15 @@ ifndef BUILD_BPF_SKEL endif ifeq ($(BUILD_BPF_SKEL),1) - ifeq ($(filter -DHAVE_LIBBPF_SUPPORT, $(CFLAGS)),) - dummy := $(warning Warning: Disabled BPF skeletons as libbpf is required) - BUILD_BPF_SKEL := 0 - else ifeq ($(filter -DHAVE_LIBELF_SUPPORT, $(CFLAGS)),) + ifeq ($(filter -DHAVE_LIBELF_SUPPORT, $(CFLAGS)),) dummy := $(warning Warning: Disabled BPF skeletons as libelf is required by bpftool) BUILD_BPF_SKEL := 0 else ifeq ($(filter -DHAVE_ZLIB_SUPPORT, $(CFLAGS)),) dummy := $(warning Warning: Disabled BPF skeletons as zlib is required by bpftool) BUILD_BPF_SKEL := 0 + else ifeq ($(filter -DHAVE_LIBBPF_SUPPORT, $(CFLAGS)),) + dummy := $(warning Warning: Disabled BPF skeletons as libbpf is required) + BUILD_BPF_SKEL := 0 else ifeq ($(call get-executable,$(CLANG)),) dummy := $(warning Warning: Disabled BPF skeletons as clang ($(CLANG)) is missing) BUILD_BPF_SKEL := 0 -- GitLab From 76db7aab1fca6688ddf9f388157521c442e0ffb8 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:24 -0700 Subject: [PATCH 0004/1290] tools headers UAPI: Sync include/uapi/linux/perf_event.h header with the kernel Sync the new sample type for the branch counters feature. Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tinghao Zhang Link: https://lore.kernel.org/r/20231025201626.3000228-6-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 39c6a250dd1b9..3a64499b0f5d6 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + PERF_SAMPLE_BRANCH_COUNTERS_SHIFT = 19, /* save occurrences of events on a branch */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -235,6 +237,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -982,6 +986,12 @@ enum perf_event_type { * { u64 nr; * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX * { u64 from, to, flags } lbr[nr]; + * # + * # The format of the counters is decided by the + * # "branch_counter_nr" and "branch_counter_width", + * # which are defined in the ABI. + * # + * { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS * } && PERF_SAMPLE_BRANCH_STACK * * { u64 abi; # enum perf_sample_regs_abi @@ -1427,6 +1437,9 @@ struct perf_branch_entry { reserved:31; }; +/* Size of used info bits in struct perf_branch_entry */ +#define PERF_BRANCH_ENTRY_INFO_BITS_MAX 33 + union perf_sample_weight { __u64 full; #if defined(__LITTLE_ENDIAN_BITFIELD) -- GitLab From ac9cd7245fffa0fc053afce3b345469e5afa533a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:25 -0700 Subject: [PATCH 0005/1290] perf header: Support num and width of branch counters To support the branch counters feature, the information of the maximum number of supported counters and the width of the counters is exposed in the sysfs caps folder. The perf tool can use the information to parse the logged counters in each branch. Store the information in the perf_env for later usage. Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tinghao Zhang Link: https://lore.kernel.org/r/20231025201626.3000228-7-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/env.h | 5 +++++ tools/perf/util/header.c | 18 +++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 4566c51f2fd95..48d7f8759a2a7 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -46,6 +46,9 @@ struct hybrid_node { struct pmu_caps { int nr_caps; unsigned int max_branches; + unsigned int br_cntr_nr; + unsigned int br_cntr_width; + char **caps; char *pmu_name; }; @@ -62,6 +65,8 @@ struct perf_env { unsigned long long total_mem; unsigned int msr_pmu_type; unsigned int max_branches; + unsigned int br_cntr_nr; + unsigned int br_cntr_width; int kernel_is_64_bit; int nr_cmdline; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index e86b9439ffee0..eeb96a1b63a7c 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3259,7 +3259,9 @@ static int process_compressed(struct feat_fd *ff, } static int __process_pmu_caps(struct feat_fd *ff, int *nr_caps, - char ***caps, unsigned int *max_branches) + char ***caps, unsigned int *max_branches, + unsigned int *br_cntr_nr, + unsigned int *br_cntr_width) { char *name, *value, *ptr; u32 nr_pmu_caps, i; @@ -3294,6 +3296,12 @@ static int __process_pmu_caps(struct feat_fd *ff, int *nr_caps, if (!strcmp(name, "branches")) *max_branches = atoi(value); + if (!strcmp(name, "branch_counter_nr")) + *br_cntr_nr = atoi(value); + + if (!strcmp(name, "branch_counter_width")) + *br_cntr_width = atoi(value); + free(value); free(name); } @@ -3318,7 +3326,9 @@ static int process_cpu_pmu_caps(struct feat_fd *ff, { int ret = __process_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps, &ff->ph->env.cpu_pmu_caps, - &ff->ph->env.max_branches); + &ff->ph->env.max_branches, + &ff->ph->env.br_cntr_nr, + &ff->ph->env.br_cntr_width); if (!ret && !ff->ph->env.cpu_pmu_caps) pr_debug("cpu pmu capabilities not available\n"); @@ -3347,7 +3357,9 @@ static int process_pmu_caps(struct feat_fd *ff, void *data __maybe_unused) for (i = 0; i < nr_pmu; i++) { ret = __process_pmu_caps(ff, &pmu_caps[i].nr_caps, &pmu_caps[i].caps, - &pmu_caps[i].max_branches); + &pmu_caps[i].max_branches, + &pmu_caps[i].br_cntr_nr, + &pmu_caps[i].br_cntr_width); if (ret) goto err; -- GitLab From 9fbb4b02302b0ae618303565025412070d32f85e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:26 -0700 Subject: [PATCH 0006/1290] perf tools: Add branch counter knob Add a new branch filter, "counter", for the branch counter option. It is used to mark the events which should be logged in the branch. If it is applied with the -j option, the counters of all the events should be logged in the branch. If the legacy kernel doesn't support the new branch sample type, switching off the branch counter filter. The stored counter values in each branch are displayed right after the regular branch stack information via perf report -D. Usage examples: # perf record -e "{branch-instructions,branch-misses}:S" -j any,counter Only the first event, branch-instructions, collect the LBR. Both branch-instructions and branch-misses are marked as logged events. The occurrences information of them can be found in the branch stack extension space of each branch. # perf record -e "{cpu/branch-instructions,branch_type=any/,cpu/branch-misses,branch_type=counter/}" Only the first event, branch-instructions, collect the LBR. Only the branch-misses event is marked as a logged event. Committer notes: I noticed 'perf test "Sample parsing"' failing, reported to the list and Kan provided a patch that checks if the evsel has a leader and that evsel->evlist is set, the comment in the source code further explains it. Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tinghao Zhang Link: https://lore.kernel.org/r/20231025201626.3000228-8-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 4 +++ tools/perf/util/evsel.c | 35 ++++++++++++++++++++++- tools/perf/util/evsel.h | 1 + tools/perf/util/parse-branch-options.c | 1 + tools/perf/util/perf_event_attr_fprintf.c | 1 + tools/perf/util/sample.h | 1 + tools/perf/util/session.c | 15 ++++++++-- 7 files changed, 55 insertions(+), 3 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 1889f66addf2a..6015fdd08fb63 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -445,6 +445,10 @@ following filters are defined: 4th-Gen Xeon+ server), the save branch type is unconditionally enabled when the taken branch stack sampling is enabled. - priv: save privilege state during sampling in case binary is not available later + - counter: save occurrences of the event since the last branch entry. Currently, the + feature is only supported by a newer CPU, e.g., Intel Sierra Forest and + later platforms. An error out is expected if it's used on the unsupported + kernel or CPUs. + The option requires at least one branch type among any, any_call, any_ret, ind_call, cond. diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 72a5dfc38d380..a5da74e3a517b 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1832,6 +1832,8 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus, static void evsel__disable_missing_features(struct evsel *evsel) { + if (perf_missing_features.branch_counters) + evsel->core.attr.branch_sample_type &= ~PERF_SAMPLE_BRANCH_COUNTERS; if (perf_missing_features.read_lost) evsel->core.attr.read_format &= ~PERF_FORMAT_LOST; if (perf_missing_features.weight_struct) { @@ -1885,7 +1887,12 @@ bool evsel__detect_missing_features(struct evsel *evsel) * Must probe features in the order they were added to the * perf_event_attr interface. */ - if (!perf_missing_features.read_lost && + if (!perf_missing_features.branch_counters && + (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS)) { + perf_missing_features.branch_counters = true; + pr_debug2("switching off branch counters support\n"); + return true; + } else if (!perf_missing_features.read_lost && (evsel->core.attr.read_format & PERF_FORMAT_LOST)) { perf_missing_features.read_lost = true; pr_debug2("switching off PERF_FORMAT_LOST support\n"); @@ -2318,6 +2325,22 @@ u64 evsel__bitfield_swap_branch_flags(u64 value) return new_val; } +static inline bool evsel__has_branch_counters(const struct evsel *evsel) +{ + struct evsel *cur, *leader = evsel__leader(evsel); + + /* The branch counters feature only supports group */ + if (!leader || !evsel->evlist) + return false; + + evlist__for_each_entry(evsel->evlist, cur) { + if ((leader == evsel__leader(cur)) && + (cur->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS)) + return true; + } + return false; +} + int evsel__parse_sample(struct evsel *evsel, union perf_event *event, struct perf_sample *data) { @@ -2551,6 +2574,16 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, OVERFLOW_CHECK(array, sz, max_size); array = (void *)array + sz; + + if (evsel__has_branch_counters(evsel)) { + OVERFLOW_CHECK_u64(array); + + data->branch_stack_cntr = (u64 *)array; + sz = data->branch_stack->nr * sizeof(u64); + + OVERFLOW_CHECK(array, sz, max_size); + array = (void *)array + sz; + } } if (type & PERF_SAMPLE_REGS_USER) { diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index d791316a1792e..f19ac9f027efc 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -191,6 +191,7 @@ struct perf_missing_features { bool code_page_size; bool weight_struct; bool read_lost; + bool branch_counters; }; extern struct perf_missing_features perf_missing_features; diff --git a/tools/perf/util/parse-branch-options.c b/tools/perf/util/parse-branch-options.c index fd67d204d720d..f7f7aff3d85a0 100644 --- a/tools/perf/util/parse-branch-options.c +++ b/tools/perf/util/parse-branch-options.c @@ -36,6 +36,7 @@ static const struct branch_mode branch_modes[] = { BRANCH_OPT("stack", PERF_SAMPLE_BRANCH_CALL_STACK), BRANCH_OPT("hw_index", PERF_SAMPLE_BRANCH_HW_INDEX), BRANCH_OPT("priv", PERF_SAMPLE_BRANCH_PRIV_SAVE), + BRANCH_OPT("counter", PERF_SAMPLE_BRANCH_COUNTERS), BRANCH_END }; diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index 2247991451f3a..8f04d3b7f3ec7 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -55,6 +55,7 @@ static void __p_branch_sample_type(char *buf, size_t size, u64 value) bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP), bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES), bit_name(TYPE_SAVE), bit_name(HW_INDEX), bit_name(PRIV_SAVE), + bit_name(COUNTERS), { .name = NULL, } }; #undef bit_name diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h index c92ad0f51ecd9..70b2c3135555e 100644 --- a/tools/perf/util/sample.h +++ b/tools/perf/util/sample.h @@ -113,6 +113,7 @@ struct perf_sample { void *raw_data; struct ip_callchain *callchain; struct branch_stack *branch_stack; + u64 *branch_stack_cntr; struct regs_dump user_regs; struct regs_dump intr_regs; struct stack_dump user_stack; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 1e9aa8ed15b64..4a094ab0362b4 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1150,9 +1150,13 @@ static void callchain__printf(struct evsel *evsel, i, callchain->ips[i]); } -static void branch_stack__printf(struct perf_sample *sample, bool callstack) +static void branch_stack__printf(struct perf_sample *sample, + struct evsel *evsel) { struct branch_entry *entries = perf_sample__branch_entries(sample); + bool callstack = evsel__has_branch_callstack(evsel); + u64 *branch_stack_cntr = sample->branch_stack_cntr; + struct perf_env *env = evsel__env(evsel); uint64_t i; if (!callstack) { @@ -1194,6 +1198,13 @@ static void branch_stack__printf(struct perf_sample *sample, bool callstack) } } } + + if (branch_stack_cntr) { + printf("... branch stack counters: nr:%" PRIu64 " (counter width: %u max counter nr:%u)\n", + sample->branch_stack->nr, env->br_cntr_width, env->br_cntr_nr); + for (i = 0; i < sample->branch_stack->nr; i++) + printf("..... %2"PRIu64": %016" PRIx64 "\n", i, branch_stack_cntr[i]); + } } static void regs_dump__printf(u64 mask, u64 *regs, const char *arch) @@ -1355,7 +1366,7 @@ static void dump_sample(struct evsel *evsel, union perf_event *event, callchain__printf(evsel, sample); if (evsel__has_br_stack(evsel)) - branch_stack__printf(sample, evsel__has_branch_callstack(evsel)); + branch_stack__printf(sample, evsel); if (sample_type & PERF_SAMPLE_REGS_USER) regs_user__printf(sample, arch); -- GitLab From 7ff7b7afe364af17362f2a08cccdc79905feb0bc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 3 Oct 2023 08:49:11 +0100 Subject: [PATCH 0007/1290] perf tools: Fix spelling mistake "parametrized" -> "parameterized" There are spelling mistakes in comments and a pr_debug message. Fix them. Reviewed-by: Athira Jajeev Reviewed-by: Ian Rogers Signed-off-by: Colin Ian King Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Link: https://lore.kernel.org/r/20231003074911.220216-1-colin.i.king@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 4 ++-- tools/perf/tests/shell/stat_all_pmu.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index f78be21a5999b..e52f45c7c3d1a 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -2549,7 +2549,7 @@ static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest if (strchr(ent->d_name, '.')) continue; - /* exclude parametrized ones (name contains '?') */ + /* exclude parameterized ones (name contains '?') */ n = snprintf(pmu_event, sizeof(pmu_event), "%s%s", path, ent->d_name); if (n >= PATH_MAX) { pr_err("pmu event name crossed PATH_MAX(%d) size\n", PATH_MAX); @@ -2578,7 +2578,7 @@ static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest fclose(file); if (is_event_parameterized == 1) { - pr_debug("skipping parametrized PMU event: %s which contains ?\n", pmu_event); + pr_debug("skipping parameterized PMU event: %s which contains ?\n", pmu_event); continue; } diff --git a/tools/perf/tests/shell/stat_all_pmu.sh b/tools/perf/tests/shell/stat_all_pmu.sh index c779554191731..d2a3506e0d196 100755 --- a/tools/perf/tests/shell/stat_all_pmu.sh +++ b/tools/perf/tests/shell/stat_all_pmu.sh @@ -4,7 +4,7 @@ set -e -# Test all PMU events; however exclude parametrized ones (name contains '?') +# Test all PMU events; however exclude parameterized ones (name contains '?') for p in $(perf list --raw-dump pmu | sed 's/[[:graph:]]\+?[[:graph:]]\+[[:space:]]//g'); do echo "Testing $p" result=$(perf stat -e "$p" true 2>&1) -- GitLab From 1a27fc01700fbff2f205000edf0d1d315b5f85cc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 2 Nov 2023 10:56:44 -0700 Subject: [PATCH 0008/1290] perf record: Lazy load kernel symbols Commit 5b7ba82a75915e73 ("perf symbols: Load kernel maps before using") changed it so that loading a kernel DSO would cause the symbols for the DSO to be eagerly loaded. For 'perf record' this is overhead as the symbols won't be used. Add a field to 'struct symbol_conf' to control the behavior and disable it for 'perf record' and 'perf inject'. Reviewed-by: Adrian Hunter Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Jajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231102175735.2272696-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 6 ++++++ tools/perf/builtin-record.c | 2 ++ tools/perf/util/event.c | 4 ++-- tools/perf/util/symbol_conf.h | 3 ++- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index c8cf2fdd9cff9..eb3ef5c24b662 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -2265,6 +2265,12 @@ int cmd_inject(int argc, const char **argv) "perf inject []", NULL }; + + if (!inject.itrace_synth_opts.set) { + /* Disable eager loading of kernel symbols that adds overhead to perf inject. */ + symbol_conf.lazy_load_kernel_maps = true; + } + #ifndef HAVE_JITDUMP set_option_nobuild(options, 'j', "jit", "NO_LIBELF=1", true); #endif diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index dcf288a4fb9a9..8ec818568662e 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -3989,6 +3989,8 @@ int cmd_record(int argc, const char **argv) # undef set_nobuild #endif + /* Disable eager loading of kernel symbols that adds overhead to perf record. */ + symbol_conf.lazy_load_kernel_maps = true; rec->opts.affinity = PERF_AFFINITY_SYS; rec->evlist = evlist__new(); diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 923c0fb151222..68f45e9e63b6e 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -617,13 +617,13 @@ struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr, if (cpumode == PERF_RECORD_MISC_KERNEL && perf_host) { al->level = 'k'; maps = machine__kernel_maps(machine); - load_map = true; + load_map = !symbol_conf.lazy_load_kernel_maps; } else if (cpumode == PERF_RECORD_MISC_USER && perf_host) { al->level = '.'; } else if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest) { al->level = 'g'; maps = machine__kernel_maps(machine); - load_map = true; + load_map = !symbol_conf.lazy_load_kernel_maps; } else if (cpumode == PERF_RECORD_MISC_GUEST_USER && perf_guest) { al->level = 'u'; } else { diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h index 0b589570d1d09..2b2fb9e224b00 100644 --- a/tools/perf/util/symbol_conf.h +++ b/tools/perf/util/symbol_conf.h @@ -42,7 +42,8 @@ struct symbol_conf { inline_name, disable_add2line_warn, buildid_mmap2, - guest_code; + guest_code, + lazy_load_kernel_maps; const char *vmlinux_name, *kallsyms_name, *source_prefix, -- GitLab From 9ffa6c7512ca7aaeb30e596e2c247cb1fae7123a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 2 Nov 2023 10:56:47 -0700 Subject: [PATCH 0009/1290] perf machine thread: Remove exited threads by default 'struct thread' values hold onto references to mmaps, DSOs, etc. When a thread exits it is necessary to clean all of this memory up by removing the thread from the machine's threads. Some tools require this doesn't happen, such as auxtrace events, 'perf report' if offcpu events exist or if a task list is being generated, so add a 'struct symbol_conf' member to make the behavior optional. When an exited thread is left in the machine's threads, mark it as exited. This change relates to commit 40826c45eb0b8856 ("perf thread: Remove notion of dead threads") . Dead threads were removed as they had a reference count of 0 and were difficult to reason about with the reference count checker. Here a thread is removed from threads when it exits, unless via symbol_conf the exited thread isn't remove and is marked as exited. Reference counting behaves as it normally does. Reviewed-by: Adrian Hunter Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Jajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231102175735.2272696-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 7 +++++++ tools/perf/util/machine.c | 10 +++++++--- tools/perf/util/session.c | 5 +++++ tools/perf/util/symbol_conf.h | 3 ++- tools/perf/util/thread.h | 14 ++++++++++++++ 5 files changed, 35 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 9cb1da2dc0c03..121a2781323c8 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1426,6 +1426,13 @@ int cmd_report(int argc, const char **argv) if (ret < 0) goto exit; + /* + * tasks_mode require access to exited threads to list those that are in + * the data file. Off-cpu events are synthesized after other events and + * reference exited threads. + */ + symbol_conf.keep_exited_threads = true; + annotation_options__init(&report.annotation_opts); ret = perf_config(report__config, &report); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 90c750150b19b..a985d004aa8d8 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2157,9 +2157,13 @@ int machine__process_exit_event(struct machine *machine, union perf_event *event if (dump_trace) perf_event__fprintf_task(event, stdout); - if (thread != NULL) - thread__put(thread); - + if (thread != NULL) { + if (symbol_conf.keep_exited_threads) + thread__set_exited(thread, /*exited=*/true); + else + machine__remove_thread(machine, thread); + } + thread__put(thread); return 0; } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 4a094ab0362b4..199d3e8df3158 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -115,6 +115,11 @@ static int perf_session__open(struct perf_session *session, int repipe_fd) return -1; } + if (perf_header__has_feat(&session->header, HEADER_AUXTRACE)) { + /* Auxiliary events may reference exited threads, hold onto dead ones. */ + symbol_conf.keep_exited_threads = true; + } + if (perf_data__is_pipe(data)) return 0; diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h index 2b2fb9e224b00..6040286e07a65 100644 --- a/tools/perf/util/symbol_conf.h +++ b/tools/perf/util/symbol_conf.h @@ -43,7 +43,8 @@ struct symbol_conf { disable_add2line_warn, buildid_mmap2, guest_code, - lazy_load_kernel_maps; + lazy_load_kernel_maps, + keep_exited_threads; const char *vmlinux_name, *kallsyms_name, *source_prefix, diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index e79225a0ea46b..0df775b5c1105 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -36,13 +36,22 @@ struct thread_rb_node { }; DECLARE_RC_STRUCT(thread) { + /** @maps: mmaps associated with this thread. */ struct maps *maps; pid_t pid_; /* Not all tools update this */ + /** @tid: thread ID number unique to a machine. */ pid_t tid; + /** @ppid: parent process of the process this thread belongs to. */ pid_t ppid; int cpu; int guest_cpu; /* For QEMU thread */ refcount_t refcnt; + /** + * @exited: Has the thread had an exit event. Such threads are usually + * removed from the machine's threads but some events/tools require + * access to dead threads. + */ + bool exited; bool comm_set; int comm_len; struct list_head namespaces_list; @@ -189,6 +198,11 @@ static inline refcount_t *thread__refcnt(struct thread *thread) return &RC_CHK_ACCESS(thread)->refcnt; } +static inline void thread__set_exited(struct thread *thread, bool exited) +{ + RC_CHK_ACCESS(thread)->exited = exited; +} + static inline bool thread__comm_set(const struct thread *thread) { return RC_CHK_ACCESS(thread)->comm_set; -- GitLab From 89d5c48c34c8a552acd9a2e3ee504b2f06879fea Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 3 Nov 2023 12:55:41 -0700 Subject: [PATCH 0010/1290] perf test: Simplify "object code reading" test It tries cycles (or cpu-clock on s390) event with exclude_kernel bit to open. But other arch on a VM can fail with the hardware event and need to fallback to the software event in the same way. So let's get rid of the cpuid check and use generic fallback mechanism using an array of event candidates. Now event in the odd index excludes the kernel so use that for the return value. Reviewed-by: Adrian Hunter Signed-off-by: Namhyung Kim Tested-by: James Clark Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20231103195541.67788-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/code-reading.c | 76 ++++++++++----------------------- 1 file changed, 23 insertions(+), 53 deletions(-) diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 3af81012014ed..d6e845c579023 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -511,38 +511,6 @@ static void fs_something(void) } } -#ifdef __s390x__ -#include "header.h" // for get_cpuid() -#endif - -static const char *do_determine_event(bool excl_kernel) -{ - const char *event = excl_kernel ? "cycles:u" : "cycles"; - -#ifdef __s390x__ - char cpuid[128], model[16], model_c[16], cpum_cf_v[16]; - unsigned int family; - int ret, cpum_cf_a; - - if (get_cpuid(cpuid, sizeof(cpuid))) - goto out_clocks; - ret = sscanf(cpuid, "%*[^,],%u,%[^,],%[^,],%[^,],%x", &family, model_c, - model, cpum_cf_v, &cpum_cf_a); - if (ret != 5) /* Not available */ - goto out_clocks; - if (excl_kernel && (cpum_cf_a & 4)) - return event; - if (!excl_kernel && (cpum_cf_a & 2)) - return event; - - /* Fall through: missing authorization */ -out_clocks: - event = excl_kernel ? "cpu-clock:u" : "cpu-clock"; - -#endif - return event; -} - static void do_something(void) { fs_something(); @@ -583,8 +551,10 @@ static int do_test_code_reading(bool try_kcore) int err = -1, ret; pid_t pid; struct map *map; - bool have_vmlinux, have_kcore, excl_kernel = false; + bool have_vmlinux, have_kcore; struct dso *dso; + const char *events[] = { "cycles", "cycles:u", "cpu-clock", "cpu-clock:u", NULL }; + int evidx = 0; pid = getpid(); @@ -618,7 +588,7 @@ static int do_test_code_reading(bool try_kcore) /* No point getting kernel events if there is no kernel object */ if (!have_vmlinux && !have_kcore) - excl_kernel = true; + evidx++; threads = thread_map__new_by_tid(pid); if (!threads) { @@ -646,7 +616,7 @@ static int do_test_code_reading(bool try_kcore) goto out_put; } - while (1) { + while (events[evidx]) { const char *str; evlist = evlist__new(); @@ -657,7 +627,7 @@ static int do_test_code_reading(bool try_kcore) perf_evlist__set_maps(&evlist->core, cpus, threads); - str = do_determine_event(excl_kernel); + str = events[evidx]; pr_debug("Parsing event '%s'\n", str); ret = parse_event(evlist, str); if (ret < 0) { @@ -675,32 +645,32 @@ static int do_test_code_reading(bool try_kcore) ret = evlist__open(evlist); if (ret < 0) { - if (!excl_kernel) { - excl_kernel = true; - /* - * Both cpus and threads are now owned by evlist - * and will be freed by following perf_evlist__set_maps - * call. Getting reference to keep them alive. - */ - perf_cpu_map__get(cpus); - perf_thread_map__get(threads); - perf_evlist__set_maps(&evlist->core, NULL, NULL); - evlist__delete(evlist); - evlist = NULL; - continue; - } + evidx++; - if (verbose > 0) { + if (events[evidx] == NULL && verbose > 0) { char errbuf[512]; evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf)); pr_debug("perf_evlist__open() failed!\n%s\n", errbuf); } - goto out_put; + /* + * Both cpus and threads are now owned by evlist + * and will be freed by following perf_evlist__set_maps + * call. Getting reference to keep them alive. + */ + perf_cpu_map__get(cpus); + perf_thread_map__get(threads); + perf_evlist__set_maps(&evlist->core, NULL, NULL); + evlist__delete(evlist); + evlist = NULL; + continue; } break; } + if (events[evidx] == NULL) + goto out_put; + ret = evlist__mmap(evlist, UINT_MAX); if (ret < 0) { pr_debug("evlist__mmap failed\n"); @@ -721,7 +691,7 @@ static int do_test_code_reading(bool try_kcore) err = TEST_CODE_READING_NO_KERNEL_OBJ; else if (!have_vmlinux && !try_kcore) err = TEST_CODE_READING_NO_VMLINUX; - else if (excl_kernel) + else if (strstr(events[evidx], ":u")) err = TEST_CODE_READING_NO_ACCESS; else err = TEST_CODE_READING_OK; -- GitLab From de2c7eb59c342d1a61124caaf2993e325a9becb7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 3 Nov 2023 12:19:03 -0700 Subject: [PATCH 0011/1290] perf annotate: Split branch stack cycles information out of 'struct annotation_line' The cycles info is used only when branch stack is provided. Separate them from 'struct annotation_line' into a separate struct and lazy allocate them to save some memory. Committer notes: Make annotation__compute_ipc() check if the lazy allocation works, bailing out if so, its callers already do error checking and propagation. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Christophe JAILLET Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231103191907.54531-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/util/annotate.c | 61 ++++++++++++++++++++++--------- tools/perf/util/annotate.h | 15 +++++--- 3 files changed, 54 insertions(+), 24 deletions(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index ccdb2cd11fbf0..d2470f87344d0 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -337,7 +337,7 @@ static void annotate_browser__calc_percent(struct annotate_browser *browser, max_percent = percent; } - if (max_percent < 0.01 && pos->al.ipc == 0) { + if (max_percent < 0.01 && (!pos->al.cycles || pos->al.cycles->ipc == 0)) { RB_CLEAR_NODE(&pos->al.rb_node); continue; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 82956adf99632..99ff3bb9cad8d 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1100,8 +1100,8 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 for (offset = start; offset <= end; offset++) { struct annotation_line *al = notes->offsets[offset]; - if (al && al->ipc == 0.0) { - al->ipc = ipc; + if (al && al->cycles && al->cycles->ipc == 0.0) { + al->cycles->ipc = ipc; cover_insn++; } } @@ -1114,12 +1114,13 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 } } -void annotation__compute_ipc(struct annotation *notes, size_t size) +static int annotation__compute_ipc(struct annotation *notes, size_t size) { + int err = 0; s64 offset; if (!notes->src || !notes->src->cycles_hist) - return; + return 0; notes->total_insn = annotation__count_insn(notes, 0, size - 1); notes->hit_cycles = 0; @@ -1134,18 +1135,39 @@ void annotation__compute_ipc(struct annotation *notes, size_t size) if (ch && ch->cycles) { struct annotation_line *al; + al = notes->offsets[offset]; + if (al && al->cycles == NULL) { + al->cycles = zalloc(sizeof(*al->cycles)); + if (al->cycles == NULL) { + err = ENOMEM; + break; + } + } if (ch->have_start) annotation__count_and_fill(notes, ch->start, offset, ch); - al = notes->offsets[offset]; if (al && ch->num_aggr) { - al->cycles = ch->cycles_aggr / ch->num_aggr; - al->cycles_max = ch->cycles_max; - al->cycles_min = ch->cycles_min; + al->cycles->avg = ch->cycles_aggr / ch->num_aggr; + al->cycles->max = ch->cycles_max; + al->cycles->min = ch->cycles_min; } notes->have_cycles = true; } } + + if (err) { + while (++offset < (s64)size) { + struct cyc_hist *ch = ¬es->src->cycles_hist[offset]; + + if (ch && ch->cycles) { + struct annotation_line *al = notes->offsets[offset]; + if (al) + zfree(&al->cycles); + } + } + } + annotation__unlock(notes); + return 0; } int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, struct perf_sample *sample, @@ -1225,6 +1247,7 @@ static void annotation_line__exit(struct annotation_line *al) { zfree_srcline(&al->path); zfree(&al->line); + zfree(&al->cycles); } static size_t disasm_line_size(int nr) @@ -3083,8 +3106,8 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati int printed; if (first_line && (al->offset == -1 || percent_max == 0.0)) { - if (notes->have_cycles) { - if (al->ipc == 0.0 && al->cycles == 0) + if (notes->have_cycles && al->cycles) { + if (al->cycles->ipc == 0.0 && al->cycles->avg == 0) show_title = true; } else show_title = true; @@ -3121,17 +3144,17 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati } if (notes->have_cycles) { - if (al->ipc) - obj__printf(obj, "%*.2f ", ANNOTATION__IPC_WIDTH - 1, al->ipc); + if (al->cycles && al->cycles->ipc) + obj__printf(obj, "%*.2f ", ANNOTATION__IPC_WIDTH - 1, al->cycles->ipc); else if (!show_title) obj__printf(obj, "%*s", ANNOTATION__IPC_WIDTH, " "); else obj__printf(obj, "%*s ", ANNOTATION__IPC_WIDTH - 1, "IPC"); if (!notes->options->show_minmax_cycle) { - if (al->cycles) + if (al->cycles && al->cycles->avg) obj__printf(obj, "%*" PRIu64 " ", - ANNOTATION__CYCLES_WIDTH - 1, al->cycles); + ANNOTATION__CYCLES_WIDTH - 1, al->cycles->avg); else if (!show_title) obj__printf(obj, "%*s", ANNOTATION__CYCLES_WIDTH, " "); @@ -3145,8 +3168,8 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati scnprintf(str, sizeof(str), "%" PRIu64 "(%" PRIu64 "/%" PRIu64 ")", - al->cycles, al->cycles_min, - al->cycles_max); + al->cycles->avg, al->cycles->min, + al->cycles->max); obj__printf(obj, "%*s ", ANNOTATION__MINMAX_CYCLES_WIDTH - 1, @@ -3264,7 +3287,11 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, annotation__set_offsets(notes, size); annotation__mark_jump_targets(notes, sym); - annotation__compute_ipc(notes, size); + + err = annotation__compute_ipc(notes, size); + if (err) + goto out_free_offsets; + annotation__init_column_widths(notes, sym); notes->nr_events = nr_pcnt; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 9627805591760..19bc2f0391757 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -130,6 +130,13 @@ struct annotation_data { struct sym_hist_entry he; }; +struct cycles_info { + float ipc; + u64 avg; + u64 max; + u64 min; +}; + struct annotation_line { struct list_head node; struct rb_node rb_node; @@ -137,12 +144,9 @@ struct annotation_line { char *line; int line_nr; char *fileloc; - int jump_sources; - float ipc; - u64 cycles; - u64 cycles_max; - u64 cycles_min; char *path; + struct cycles_info *cycles; + int jump_sources; u32 idx; int idx_asm; int data_nr; @@ -325,7 +329,6 @@ static inline bool annotation_line__filter(struct annotation_line *al, struct an } void annotation__set_offsets(struct annotation *notes, s64 size); -void annotation__compute_ipc(struct annotation *notes, size_t size); void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym); void annotation__update_column_widths(struct annotation *notes); void annotation__init_column_widths(struct annotation *notes, struct symbol *sym); -- GitLab From b7f87e32590bf48eca84f729d3422be7b8dc22d3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 3 Nov 2023 12:19:04 -0700 Subject: [PATCH 0012/1290] perf annotate: Split branch stack cycles info from 'struct annotation' The cycles info is only meaningful when sample has branch stacks. To save the memory for normal cases, move those fields to a new 'struct annotated_branch' and dynamically allocate it when needed. Also move cycles_hist from annotated_source as it's related here. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Christophe JAILLET Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231103191907.54531-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 99 ++++++++++++++++++++---------------- tools/perf/util/annotate.h | 17 ++++--- tools/perf/util/block-info.c | 4 +- tools/perf/util/sort.c | 14 ++--- 4 files changed, 73 insertions(+), 61 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 99ff3bb9cad8d..4e40c94c85d10 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -810,7 +810,6 @@ static __maybe_unused void annotated_source__delete(struct annotated_source *src if (src == NULL) return; zfree(&src->histograms); - zfree(&src->cycles_hist); free(src); } @@ -845,18 +844,6 @@ static int annotated_source__alloc_histograms(struct annotated_source *src, return src->histograms ? 0 : -1; } -/* The cycles histogram is lazily allocated. */ -static int symbol__alloc_hist_cycles(struct symbol *sym) -{ - struct annotation *notes = symbol__annotation(sym); - const size_t size = symbol__size(sym); - - notes->src->cycles_hist = calloc(size, sizeof(struct cyc_hist)); - if (notes->src->cycles_hist == NULL) - return -1; - return 0; -} - void symbol__annotate_zero_histograms(struct symbol *sym) { struct annotation *notes = symbol__annotation(sym); @@ -865,9 +852,10 @@ void symbol__annotate_zero_histograms(struct symbol *sym) if (notes->src != NULL) { memset(notes->src->histograms, 0, notes->src->nr_histograms * notes->src->sizeof_sym_hist); - if (notes->src->cycles_hist) - memset(notes->src->cycles_hist, 0, - symbol__size(sym) * sizeof(struct cyc_hist)); + } + if (notes->branch && notes->branch->cycles_hist) { + memset(notes->branch->cycles_hist, 0, + symbol__size(sym) * sizeof(struct cyc_hist)); } annotation__unlock(notes); } @@ -958,23 +946,33 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms, return 0; } +static struct annotated_branch *annotation__get_branch(struct annotation *notes) +{ + if (notes == NULL) + return NULL; + + if (notes->branch == NULL) + notes->branch = zalloc(sizeof(*notes->branch)); + + return notes->branch; +} + static struct cyc_hist *symbol__cycles_hist(struct symbol *sym) { struct annotation *notes = symbol__annotation(sym); + struct annotated_branch *branch; - if (notes->src == NULL) { - notes->src = annotated_source__new(); - if (notes->src == NULL) - return NULL; - goto alloc_cycles_hist; - } + branch = annotation__get_branch(notes); + if (branch == NULL) + return NULL; + + if (branch->cycles_hist == NULL) { + const size_t size = symbol__size(sym); - if (!notes->src->cycles_hist) { -alloc_cycles_hist: - symbol__alloc_hist_cycles(sym); + branch->cycles_hist = calloc(size, sizeof(struct cyc_hist)); } - return notes->src->cycles_hist; + return branch->cycles_hist; } struct annotated_source *symbol__hists(struct symbol *sym, int nr_hists) @@ -1083,6 +1081,14 @@ static unsigned annotation__count_insn(struct annotation *notes, u64 start, u64 return n_insn; } +static void annotated_branch__delete(struct annotated_branch *branch) +{ + if (branch) { + zfree(&branch->cycles_hist); + free(branch); + } +} + static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 end, struct cyc_hist *ch) { unsigned n_insn; @@ -1091,6 +1097,7 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 n_insn = annotation__count_insn(notes, start, end); if (n_insn && ch->num && ch->cycles) { + struct annotated_branch *branch; float ipc = n_insn / ((double)ch->cycles / (double)ch->num); /* Hide data when there are too many overlaps. */ @@ -1106,10 +1113,11 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 } } - if (cover_insn) { - notes->hit_cycles += ch->cycles; - notes->hit_insn += n_insn * ch->num; - notes->cover_insn += cover_insn; + branch = annotation__get_branch(notes); + if (cover_insn && branch) { + branch->hit_cycles += ch->cycles; + branch->hit_insn += n_insn * ch->num; + branch->cover_insn += cover_insn; } } } @@ -1119,19 +1127,19 @@ static int annotation__compute_ipc(struct annotation *notes, size_t size) int err = 0; s64 offset; - if (!notes->src || !notes->src->cycles_hist) + if (!notes->branch || !notes->branch->cycles_hist) return 0; - notes->total_insn = annotation__count_insn(notes, 0, size - 1); - notes->hit_cycles = 0; - notes->hit_insn = 0; - notes->cover_insn = 0; + notes->branch->total_insn = annotation__count_insn(notes, 0, size - 1); + notes->branch->hit_cycles = 0; + notes->branch->hit_insn = 0; + notes->branch->cover_insn = 0; annotation__lock(notes); for (offset = size - 1; offset >= 0; --offset) { struct cyc_hist *ch; - ch = ¬es->src->cycles_hist[offset]; + ch = ¬es->branch->cycles_hist[offset]; if (ch && ch->cycles) { struct annotation_line *al; @@ -1150,13 +1158,12 @@ static int annotation__compute_ipc(struct annotation *notes, size_t size) al->cycles->max = ch->cycles_max; al->cycles->min = ch->cycles_min; } - notes->have_cycles = true; } } if (err) { while (++offset < (s64)size) { - struct cyc_hist *ch = ¬es->src->cycles_hist[offset]; + struct cyc_hist *ch = ¬es->branch->cycles_hist[offset]; if (ch && ch->cycles) { struct annotation_line *al = notes->offsets[offset]; @@ -1322,6 +1329,7 @@ int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool r void annotation__exit(struct annotation *notes) { annotated_source__delete(notes->src); + annotated_branch__delete(notes->branch); } static struct sharded_mutex *sharded_mutex; @@ -3075,13 +3083,14 @@ call_like: static void ipc_coverage_string(char *bf, int size, struct annotation *notes) { double ipc = 0.0, coverage = 0.0; + struct annotated_branch *branch = annotation__get_branch(notes); - if (notes->hit_cycles) - ipc = notes->hit_insn / ((double)notes->hit_cycles); + if (branch && branch->hit_cycles) + ipc = branch->hit_insn / ((double)branch->hit_cycles); - if (notes->total_insn) { - coverage = notes->cover_insn * 100.0 / - ((double)notes->total_insn); + if (branch && branch->total_insn) { + coverage = branch->cover_insn * 100.0 / + ((double)branch->total_insn); } scnprintf(bf, size, "(Average IPC: %.2f, IPC Coverage: %.1f%%)", @@ -3106,7 +3115,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati int printed; if (first_line && (al->offset == -1 || percent_max == 0.0)) { - if (notes->have_cycles && al->cycles) { + if (notes->branch && al->cycles) { if (al->cycles->ipc == 0.0 && al->cycles->avg == 0) show_title = true; } else @@ -3143,7 +3152,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati } } - if (notes->have_cycles) { + if (notes->branch) { if (al->cycles && al->cycles->ipc) obj__printf(obj, "%*.2f ", ANNOTATION__IPC_WIDTH - 1, al->cycles->ipc); else if (!show_title) diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 19bc2f0391757..508b93d3dcde9 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -271,17 +271,20 @@ struct annotated_source { struct list_head source; int nr_histograms; size_t sizeof_sym_hist; - struct cyc_hist *cycles_hist; struct sym_hist *histograms; }; -struct LOCKABLE annotation { - u64 max_coverage; - u64 start; +struct annotated_branch { u64 hit_cycles; u64 hit_insn; unsigned int total_insn; unsigned int cover_insn; + struct cyc_hist *cycles_hist; +}; + +struct LOCKABLE annotation { + u64 max_coverage; + u64 start; struct annotation_options *options; struct annotation_line **offsets; int nr_events; @@ -297,8 +300,8 @@ struct LOCKABLE annotation { u8 max_addr; u8 max_ins_name; } widths; - bool have_cycles; struct annotated_source *src; + struct annotated_branch *branch; }; static inline void annotation__init(struct annotation *notes __maybe_unused) @@ -312,10 +315,10 @@ bool annotation__trylock(struct annotation *notes) EXCLUSIVE_TRYLOCK_FUNCTION(tr static inline int annotation__cycles_width(struct annotation *notes) { - if (notes->have_cycles && notes->options->show_minmax_cycle) + if (notes->branch && notes->options->show_minmax_cycle) return ANNOTATION__IPC_WIDTH + ANNOTATION__MINMAX_CYCLES_WIDTH; - return notes->have_cycles ? ANNOTATION__IPC_WIDTH + ANNOTATION__CYCLES_WIDTH : 0; + return notes->branch ? ANNOTATION__IPC_WIDTH + ANNOTATION__CYCLES_WIDTH : 0; } static inline int annotation__pcnt_width(struct annotation *notes) diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c index 591fc1edd385c..08f82c1f166c3 100644 --- a/tools/perf/util/block-info.c +++ b/tools/perf/util/block-info.c @@ -129,9 +129,9 @@ int block_info__process_sym(struct hist_entry *he, struct block_hist *bh, al.sym = he->ms.sym; notes = symbol__annotation(he->ms.sym); - if (!notes || !notes->src || !notes->src->cycles_hist) + if (!notes || !notes->branch || !notes->branch->cycles_hist) return 0; - ch = notes->src->cycles_hist; + ch = notes->branch->cycles_hist; for (unsigned int i = 0; i < symbol__size(he->ms.sym); i++) { if (ch[i].num_aggr) { struct block_info *bi; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 80e4f61327401..27b123ccd2d15 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -583,21 +583,21 @@ static int hist_entry__sym_ipc_snprintf(struct hist_entry *he, char *bf, { struct symbol *sym = he->ms.sym; - struct annotation *notes; + struct annotated_branch *branch; double ipc = 0.0, coverage = 0.0; char tmp[64]; if (!sym) return repsep_snprintf(bf, size, "%-*s", width, "-"); - notes = symbol__annotation(sym); + branch = symbol__annotation(sym)->branch; - if (notes->hit_cycles) - ipc = notes->hit_insn / ((double)notes->hit_cycles); + if (branch && branch->hit_cycles) + ipc = branch->hit_insn / ((double)branch->hit_cycles); - if (notes->total_insn) { - coverage = notes->cover_insn * 100.0 / - ((double)notes->total_insn); + if (branch && branch->total_insn) { + coverage = branch->cover_insn * 100.0 / + ((double)branch->total_insn); } snprintf(tmp, sizeof(tmp), "%-5.2f [%5.1f%%]", ipc, coverage); -- GitLab From 2b215ec71b888ec2f3ca86846ce3a7f20b0d5e4d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 3 Nov 2023 12:19:05 -0700 Subject: [PATCH 0013/1290] perf annotate: Move max_coverage from 'struct annotation' to 'struct annotated_branch' The max_coverage field is only used when branch stack info is available so it'd be natural to move to 'struct annotated_branch'. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Christophe JAILLET Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231103191907.54531-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 7 +++++-- tools/perf/util/annotate.c | 2 +- tools/perf/util/annotate.h | 4 +++- tools/perf/util/block-range.c | 7 ++++++- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index aeeb801f1ed7b..a9129b51d5118 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -94,6 +94,7 @@ static void process_basic_block(struct addr_map_symbol *start, struct annotation *notes = sym ? symbol__annotation(sym) : NULL; struct block_range_iter iter; struct block_range *entry; + struct annotated_branch *branch; /* * Sanity; NULL isn't executable and the CPU cannot execute backwards @@ -105,6 +106,8 @@ static void process_basic_block(struct addr_map_symbol *start, if (!block_range_iter__valid(&iter)) return; + branch = annotation__get_branch(notes); + /* * First block in range is a branch target. */ @@ -118,8 +121,8 @@ static void process_basic_block(struct addr_map_symbol *start, entry->coverage++; entry->sym = sym; - if (notes) - notes->max_coverage = max(notes->max_coverage, entry->coverage); + if (branch) + branch->max_coverage = max(branch->max_coverage, entry->coverage); } while (block_range_iter__next(&iter)); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 4e40c94c85d10..077a297f4dadc 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -946,7 +946,7 @@ static int __symbol__inc_addr_samples(struct map_symbol *ms, return 0; } -static struct annotated_branch *annotation__get_branch(struct annotation *notes) +struct annotated_branch *annotation__get_branch(struct annotation *notes) { if (notes == NULL) return NULL; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 508b93d3dcde9..8497130989538 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -280,10 +280,10 @@ struct annotated_branch { unsigned int total_insn; unsigned int cover_insn; struct cyc_hist *cycles_hist; + u64 max_coverage; }; struct LOCKABLE annotation { - u64 max_coverage; u64 start; struct annotation_options *options; struct annotation_line **offsets; @@ -355,6 +355,8 @@ static inline struct annotation *symbol__annotation(struct symbol *sym) int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, struct perf_sample *sample, struct evsel *evsel); +struct annotated_branch *annotation__get_branch(struct annotation *notes); + int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, struct addr_map_symbol *start, unsigned cycles); diff --git a/tools/perf/util/block-range.c b/tools/perf/util/block-range.c index 680e92774d0cd..15c42196c24c8 100644 --- a/tools/perf/util/block-range.c +++ b/tools/perf/util/block-range.c @@ -311,6 +311,7 @@ done: double block_range__coverage(struct block_range *br) { struct symbol *sym; + struct annotated_branch *branch; if (!br) { if (block_ranges.blocks) @@ -323,5 +324,9 @@ double block_range__coverage(struct block_range *br) if (!sym) return -1; - return (double)br->coverage / symbol__annotation(sym)->max_coverage; + branch = symbol__annotation(sym)->branch; + if (!branch) + return -1; + + return (double)br->coverage / branch->max_coverage; } -- GitLab From 0aae4c99c5f8f748c6cb5ca03bb3b3ae8cfb10df Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 3 Nov 2023 12:19:06 -0700 Subject: [PATCH 0014/1290] perf annotate: Move some source code related fields from 'struct annotation' to 'struct annotated_source' Some fields in the 'struct annotation' are only used with 'struct annotated_source' so better to be moved there in order to reduce memory consumption for other symbols. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Christophe JAILLET Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231103191907.54531-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 12 ++++++------ tools/perf/util/annotate.c | 17 +++++++++-------- tools/perf/util/annotate.h | 14 +++++++------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index d2470f87344d0..1b42db70c9987 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -384,7 +384,7 @@ static bool annotate_browser__toggle_source(struct annotate_browser *browser) if (al->idx_asm < offset) offset = al->idx; - browser->b.nr_entries = notes->nr_entries; + browser->b.nr_entries = notes->src->nr_entries; notes->options->hide_src_code = false; browser->b.seek(&browser->b, -offset, SEEK_CUR); browser->b.top_idx = al->idx - offset; @@ -402,7 +402,7 @@ static bool annotate_browser__toggle_source(struct annotate_browser *browser) if (al->idx_asm < offset) offset = al->idx_asm; - browser->b.nr_entries = notes->nr_asm_entries; + browser->b.nr_entries = notes->src->nr_asm_entries; notes->options->hide_src_code = true; browser->b.seek(&browser->b, -offset, SEEK_CUR); browser->b.top_idx = al->idx_asm - offset; @@ -435,7 +435,7 @@ static void ui_browser__init_asm_mode(struct ui_browser *browser) { struct annotation *notes = browser__annotation(browser); ui_browser__reset_index(browser); - browser->nr_entries = notes->nr_asm_entries; + browser->nr_entries = notes->src->nr_asm_entries; } static int sym_title(struct symbol *sym, struct map *map, char *title, @@ -860,7 +860,7 @@ show_help: browser->b.height, browser->b.index, browser->b.top_idx, - notes->nr_asm_entries); + notes->src->nr_asm_entries); } continue; case K_ENTER: @@ -991,8 +991,8 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, ui_helpline__push("Press ESC to exit"); - browser.b.width = notes->max_line_len; - browser.b.nr_entries = notes->nr_entries; + browser.b.width = notes->src->max_line_len; + browser.b.nr_entries = notes->src->nr_entries; browser.b.entries = ¬es->src->source, browser.b.width += 18; /* Percentage */ diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 077a297f4dadc..23e68b1abc2fa 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2825,19 +2825,20 @@ void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) void annotation__set_offsets(struct annotation *notes, s64 size) { struct annotation_line *al; + struct annotated_source *src = notes->src; - notes->max_line_len = 0; - notes->nr_entries = 0; - notes->nr_asm_entries = 0; + src->max_line_len = 0; + src->nr_entries = 0; + src->nr_asm_entries = 0; - list_for_each_entry(al, ¬es->src->source, node) { + list_for_each_entry(al, &src->source, node) { size_t line_len = strlen(al->line); - if (notes->max_line_len < line_len) - notes->max_line_len = line_len; - al->idx = notes->nr_entries++; + if (src->max_line_len < line_len) + src->max_line_len = line_len; + al->idx = src->nr_entries++; if (al->offset != -1) { - al->idx_asm = notes->nr_asm_entries++; + al->idx_asm = src->nr_asm_entries++; /* * FIXME: short term bandaid to cope with assembly * routines that comes with labels in the same column diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 8497130989538..9eb7b6d3fe958 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -268,10 +268,13 @@ struct cyc_hist { * returns. */ struct annotated_source { - struct list_head source; - int nr_histograms; - size_t sizeof_sym_hist; - struct sym_hist *histograms; + struct list_head source; + size_t sizeof_sym_hist; + struct sym_hist *histograms; + int nr_histograms; + int nr_entries; + int nr_asm_entries; + u16 max_line_len; }; struct annotated_branch { @@ -289,9 +292,6 @@ struct LOCKABLE annotation { struct annotation_line **offsets; int nr_events; int max_jump_sources; - int nr_entries; - int nr_asm_entries; - u16 max_line_len; struct { u8 addr; u8 jumps; -- GitLab From b753d48f53f9dcea655d359f028c8adfdd9504b5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 3 Nov 2023 12:19:07 -0700 Subject: [PATCH 0015/1290] perf annotate: Move offsets array from 'struct annotation' to 'struct annotated_source' The offsets array keeps pointers to 'struct annotation_line' entries which are available in the 'struct annotated_source'. Let's move it to there. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Christophe JAILLET Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231103191907.54531-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 4 ++-- tools/perf/util/annotate.c | 20 ++++++++++---------- tools/perf/util/annotate.h | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 1b42db70c9987..163f916fff68e 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -188,7 +188,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) * name right after the '<' token and probably treating this like a * 'call' instruction. */ - target = notes->offsets[cursor->ops.target.offset]; + target = notes->src->offsets[cursor->ops.target.offset]; if (target == NULL) { ui_helpline__printf("WARN: jump target inconsistency, press 'o', notes->offsets[%#x] = NULL\n", cursor->ops.target.offset); @@ -1006,6 +1006,6 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, out_free_offsets: if(not_annotated) - zfree(¬es->offsets); + zfree(¬es->src->offsets); return ret; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 23e68b1abc2fa..9b68b8e3791ca 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1075,7 +1075,7 @@ static unsigned annotation__count_insn(struct annotation *notes, u64 start, u64 u64 offset; for (offset = start; offset <= end; offset++) { - if (notes->offsets[offset]) + if (notes->src->offsets[offset]) n_insn++; } return n_insn; @@ -1105,7 +1105,7 @@ static void annotation__count_and_fill(struct annotation *notes, u64 start, u64 return; for (offset = start; offset <= end; offset++) { - struct annotation_line *al = notes->offsets[offset]; + struct annotation_line *al = notes->src->offsets[offset]; if (al && al->cycles && al->cycles->ipc == 0.0) { al->cycles->ipc = ipc; @@ -1143,7 +1143,7 @@ static int annotation__compute_ipc(struct annotation *notes, size_t size) if (ch && ch->cycles) { struct annotation_line *al; - al = notes->offsets[offset]; + al = notes->src->offsets[offset]; if (al && al->cycles == NULL) { al->cycles = zalloc(sizeof(*al->cycles)); if (al->cycles == NULL) { @@ -1166,7 +1166,7 @@ static int annotation__compute_ipc(struct annotation *notes, size_t size) struct cyc_hist *ch = ¬es->branch->cycles_hist[offset]; if (ch && ch->cycles) { - struct annotation_line *al = notes->offsets[offset]; + struct annotation_line *al = notes->src->offsets[offset]; if (al) zfree(&al->cycles); } @@ -2800,7 +2800,7 @@ void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) return; for (offset = 0; offset < size; ++offset) { - struct annotation_line *al = notes->offsets[offset]; + struct annotation_line *al = notes->src->offsets[offset]; struct disasm_line *dl; dl = disasm_line(al); @@ -2808,7 +2808,7 @@ void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym) if (!disasm_line__is_valid_local_jump(dl, sym)) continue; - al = notes->offsets[dl->ops.target.offset]; + al = notes->src->offsets[dl->ops.target.offset]; /* * FIXME: Oops, no jump target? Buggy disassembler? Or do we @@ -2847,7 +2847,7 @@ void annotation__set_offsets(struct annotation *notes, s64 size) * E.g. copy_user_generic_unrolled */ if (al->offset < size) - notes->offsets[al->offset] = al; + notes->src->offsets[al->offset] = al; } else al->idx_asm = -1; } @@ -3280,8 +3280,8 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, size_t size = symbol__size(sym); int nr_pcnt = 1, err; - notes->offsets = zalloc(size * sizeof(struct annotation_line *)); - if (notes->offsets == NULL) + notes->src->offsets = zalloc(size * sizeof(struct annotation_line *)); + if (notes->src->offsets == NULL) return ENOMEM; if (evsel__is_group_event(evsel)) @@ -3311,7 +3311,7 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, return 0; out_free_offsets: - zfree(¬es->offsets); + zfree(¬es->src->offsets); return err; } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 9eb7b6d3fe958..de59c1aff08e8 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -271,6 +271,7 @@ struct annotated_source { struct list_head source; size_t sizeof_sym_hist; struct sym_hist *histograms; + struct annotation_line **offsets; int nr_histograms; int nr_entries; int nr_asm_entries; @@ -289,7 +290,6 @@ struct annotated_branch { struct LOCKABLE annotation { u64 start; struct annotation_options *options; - struct annotation_line **offsets; int nr_events; int max_jump_sources; struct { -- GitLab From 4a5aaaf308b91e3da21c3b8f7e78dc5a256d9324 Mon Sep 17 00:00:00 2001 From: zhaimingbing Date: Mon, 30 Oct 2023 15:58:25 +0800 Subject: [PATCH 0016/1290] perf tests attr: Fix spelling mistake "whic" to "which" There is a spelling mistake, Please fix it. Reviewed-by: Ian Rogers Signed-off-by: zhaimingbing Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Link: https://lore.kernel.org/r/20231030075825.3701-1-zhaimingbing@cmss.chinamobile.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/attr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c index 61186d0d1cfa1..97e1bdd6ec0e9 100644 --- a/tools/perf/tests/attr.c +++ b/tools/perf/tests/attr.c @@ -188,7 +188,7 @@ static int test__attr(struct test_suite *test __maybe_unused, int subtest __mayb if (perf_pmus__num_core_pmus() > 1) { /* * TODO: Attribute tests hard code the PMU type. If there are >1 - * core PMU then each PMU will have a different type whic + * core PMU then each PMU will have a different type which * requires additional support. */ pr_debug("Skip test on hybrid systems"); -- GitLab From 36c70e44a37b84cbb4094bf6f5cdb01cfd6659df Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 30 Oct 2023 11:14:38 +0000 Subject: [PATCH 0017/1290] perf tools: Add the python_ext_build directory to .gitignore `python_ext_build` is the build directory for python.so, ignore it for cleaner git status. Signed-off-by: Yang Jihong Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231030111438.1357962-2-yangjihong1@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index f533e76fb4800..ee5c14f3b8b19 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -49,3 +49,4 @@ libtraceevent/ libtraceevent_plugins/ fixdep Documentation/doc.dep +python_ext_build/ -- GitLab From b861fd7e0efc4dc2376e7212f59a9b32d1383c00 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 6 Nov 2023 10:16:27 +0100 Subject: [PATCH 0018/1290] perf tests offcpu: Adjust test case perf record offcpu profiling tests for s390 On s390 using linux-next the test case: 87: perf record offcpu profiling tests fails. The root cause is this command # ./perf record --off-cpu -e dummy -- ./perf bench sched messaging -l 10 # Running 'sched/messaging' benchmark: # 20 sender and receiver processes per group # 10 groups == 400 processes run Total time: 0.231 [sec] [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.077 MB perf.data (401 samples) ] # It does not generate 800+ sample entries, on s390 usually around 40[1-9], sometimes a few more, but never more than 450. The higher the number of CPUs the lower the number of samples. Looking at function chain: bench_sched_messaging() +--> group() the senders and receiver threads are created. The senders and receivers call function ready() which writes one bytes and wait for a reply using poll system() call. As context switches are counted, the function ready() will trigger a context switch when no input data is available after the write system call. The write system call does not trigger context switches when the data size is small. And writing 1000 bytes (10 iterations with 100 bytes) is not much and certainly won't block. The 400+ context switch on s390 occur when the some receiver/sender threads call ready() and wait for the response from function bench_sched_messaging() being kicked off. Lower the number of expected context switches to 400 to succeed on s390. Suggested-by: Namhyung Kim Signed-off-by: Ilya Leoshkevich Signed-off-by: Thomas Richter Acked-by: Namhyung Kim Cc: Heiko Carstens Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Co-developed-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20231106091627.2022530-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record_offcpu.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/shell/record_offcpu.sh b/tools/perf/tests/shell/record_offcpu.sh index a1ef8f0d2b5cc..67c925f3a15aa 100755 --- a/tools/perf/tests/shell/record_offcpu.sh +++ b/tools/perf/tests/shell/record_offcpu.sh @@ -77,9 +77,9 @@ test_offcpu_child() { err=1 return fi - # each process waits for read and write, so it should be more than 800 events + # each process waits at least for poll, so it should be more than 400 events if ! perf report -i ${perfdata} -s comm -q -n -t ';' --percent-limit=90 | \ - awk -F ";" '{ if (NF > 3 && int($3) < 800) exit 1; }' + awk -F ";" '{ if (NF > 3 && int($3) < 400) exit 1; }' then echo "Child task off-cpu test [Failed invalid output]" err=1 -- GitLab From 33ce9fc4f8ddba30b7040bd01cea11080f40b344 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 6 Nov 2023 15:10:48 +0000 Subject: [PATCH 0019/1290] perf test: Add option to change objdump binary All of the other Perf subcommands that use objdump have an option to specify the binary, so add the same option to 'perf test'. This is useful if you have built the kernel with a different toolchain to the system one, where the system objdump may fail to disassemble vmlinux. Now this can be fixed with something like this: $ perf test --objdump llvm-objdump "object code reading" Reviewed-by: Ian Rogers Signed-off-by: James Clark Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Jajeev Cc: Fangrui Song Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Tom Rix Cc: Yang Jihong Cc: llvm@lists.linux.dev Link: https://lore.kernel.org/r/20231106151051.129440-2-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 3 +++ tools/perf/tests/code-reading.c | 2 +- tools/perf/tests/tests.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index cb6f1dd00dc48..a8d17dd505888 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -32,6 +32,7 @@ static bool dont_fork; const char *dso_to_test; +const char *test_objdump_path = "objdump"; /* * List of architecture specific tests. Not a weak symbol as the array length is @@ -529,6 +530,8 @@ int cmd_test(int argc, const char **argv) "Do not fork for testcase"), OPT_STRING('w', "workload", &workload, "work", "workload to run for testing"), OPT_STRING(0, "dso", &dso_to_test, "dso", "dso to test"), + OPT_STRING(0, "objdump", &test_objdump_path, "path", + "objdump binary to use for disassembly and annotations"), OPT_END() }; const char * const test_subcommands[] = { "list", NULL }; diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index d6e845c579023..8620146d03783 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -185,7 +185,7 @@ static int read_via_objdump(const char *filename, u64 addr, void *buf, int ret; fmt = "%s -z -d --start-address=0x%"PRIx64" --stop-address=0x%"PRIx64" %s"; - ret = snprintf(cmd, sizeof(cmd), fmt, "objdump", addr, addr + len, + ret = snprintf(cmd, sizeof(cmd), fmt, test_objdump_path, addr, addr + len, filename); if (ret <= 0 || (size_t)ret >= sizeof(cmd)) return -1; diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index b394f3ac2d667..dad3d7414142d 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -207,5 +207,6 @@ DECLARE_WORKLOAD(brstack); DECLARE_WORKLOAD(datasym); extern const char *dso_to_test; +extern const char *test_objdump_path; #endif /* TESTS_H */ -- GitLab From 6aad765d10c5cd8a62b258c359bae643ab2d45da Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 6 Nov 2023 15:10:49 +0000 Subject: [PATCH 0020/1290] perf test: Add support for setting objdump binary via perf config Add a 'perf config' variable that does the same thing as "perf test --objdump ". Also update the man page. Committer testing: # perf config test.objdump # perf test "object code reading" 26: Object code reading : Ok # perf config test.objdump=blah # perf config test.objdump test.objdump=blah # perf test "object code reading" 26: Object code reading : FAILED! # perf test -v "object code reading" 26: Object code reading : --- start --- test child forked, pid 600599 Looking at the vmlinux_path (8 entries long) Using /proc/kcore for kernel data Using /proc/kallsyms for symbols Parsing event 'cycles' Using CPUID AuthenticAMD-25-21-0 mmap size 528384B Reading object code for memory address: 0x4d9a02 File is: /home/acme/bin/perf On file address is: 0xd9a02 Objdump command is: blah -z -d --start-address=0x4d9a02 --stop-address=0x4d9a82 /home/acme/bin/perf objdump read too few bytes: 128 Bytes read differ from those read by objdump buf1 (dso): 0x48 0x85 0xff 0x74 0x29 0xe8 0x94 0xdf 0x07 0x00 0x8b 0x73 0x1c 0x48 0x8b 0x43 0x08 0xeb 0xa5 0x0f 0x1f 0x00 0x48 0x8b 0x45 0xe8 0x64 0x48 0x2b 0x04 0x25 0x28 0x00 0x00 0x00 0x75 0x0f 0x48 0x8b 0x5d 0xf8 0xc9 0xc3 0x0f 0x1f 0x00 0x48 0x8b 0x43 0x08 0xeb 0x84 0xe8 0xc5 0x3e 0xf3 0xff 0x0f 0x1f 0x44 0x00 0x00 0x55 0x48 0x89 0xe5 0x41 0x56 0x41 0x55 0x49 0x89 0xd5 0x41 0x54 0x49 0x89 0xfc 0x53 0x48 0x89 0xf3 0x48 0x83 0xec 0x30 0x48 0x8b 0x7e 0x20 0x64 0x48 0x8b 0x04 0x25 0x28 0x00 0x00 0x00 0x48 0x89 0x45 0xd8 0x31 0xc0 0x48 0x89 0x75 0xb0 0x48 0xc7 0x45 0xb8 0x00 0x00 0x00 0x00 0x48 0xc7 0x45 0xc0 0x00 0x00 0x00 0x00 0xe8 0xad 0xfa buf2 (objdump): 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 test child finished with -1 ---- end ---- Object code reading: FAILED! # perf config test.objdump=/usr/bin/objdump # perf config test.objdump test.objdump=/usr/bin/objdump # perf test "object code reading" 26: Object code reading : Ok # Signed-off-by: James Clark Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Jajeev Cc: Fangrui Song Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Tom Rix Cc: Yang Jihong Cc: Yonghong Song Cc: llvm@lists.linux.dev Link: https://lore.kernel.org/r/20231106151051.129440-3-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-config.txt | 4 ++++ tools/perf/tests/builtin-test.c | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 0b4e79dbd3f68..16398babd1ef5 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -722,6 +722,10 @@ session-.*:: Defines new record session for daemon. The value is record's command line without the 'record' keyword. +test.*:: + + test.objdump:: + objdump binary to use for disassembly and annotations. SEE ALSO -------- diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index a8d17dd505888..113e92119e1d1 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -14,6 +14,7 @@ #include #include #include "builtin.h" +#include "config.h" #include "hist.h" #include "intlist.h" #include "tests.h" @@ -514,6 +515,15 @@ static int run_workload(const char *work, int argc, const char **argv) return -1; } +static int perf_test__config(const char *var, const char *value, + void *data __maybe_unused) +{ + if (!strcmp(var, "test.objdump")) + test_objdump_path = value; + + return 0; +} + int cmd_test(int argc, const char **argv) { const char *test_usage[] = { @@ -541,6 +551,8 @@ int cmd_test(int argc, const char **argv) if (ret < 0) return ret; + perf_config(perf_test__config, NULL); + /* Unbuffered output */ setvbuf(stdout, NULL, _IONBF, 0); -- GitLab From 6512b6aa237db36d881a81cc312db39668e61853 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 2 Nov 2023 10:56:54 -0700 Subject: [PATCH 0021/1290] perf bpf: Don't synthesize BPF events when disabled If BPF sideband events are disabled on the command line, don't synthesize BPF events too. Signed-off-by: Ian Rogers Acked-by: Song Liu Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Jajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231102175735.2272696-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-event.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 38fcf3ba5749d..830711cae30d3 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -386,6 +386,9 @@ int perf_event__synthesize_bpf_events(struct perf_session *session, int err; int fd; + if (opts->no_bpf_event) + return 0; + event = malloc(sizeof(event->bpf) + KSYM_NAME_LEN + machine->id_hdr_size); if (!event) return -1; -- GitLab From a399ee6773d6a0203f9bd764f8bd9d978878cef1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 9 Nov 2023 16:34:09 -0300 Subject: [PATCH 0022/1290] tools: Disable __packed attribute compiler warning due to -Werror=attributes Noticed on several perf tools cross build test containers: [perfbuilder@five ~]$ grep FAIL ~/dm.log/summary 19 10.18 debian:experimental-x-mips : FAIL gcc version 12.3.0 (Debian 12.3.0-6) 20 11.21 debian:experimental-x-mips64 : FAIL gcc version 12.3.0 (Debian 12.3.0-6) 21 11.30 debian:experimental-x-mipsel : FAIL gcc version 12.3.0 (Debian 12.3.0-6) 37 12.07 ubuntu:18.04-x-arm : FAIL gcc version 7.5.0 (Ubuntu/Linaro 7.5.0-3ubuntu1~18.04) 42 11.91 ubuntu:18.04-x-riscv64 : FAIL gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04) 44 13.17 ubuntu:18.04-x-sh4 : FAIL gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04) 45 12.09 ubuntu:18.04-x-sparc64 : FAIL gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04) [perfbuilder@five ~]$ In file included from util/intel-pt-decoder/intel-pt-pkt-decoder.c:10: /tmp/perf-6.6.0-rc1/tools/include/asm-generic/unaligned.h: In function 'get_unaligned_le16': /tmp/perf-6.6.0-rc1/tools/include/asm-generic/unaligned.h:13:29: error: packed attribute causes inefficient alignment for 'x' [-Werror=attributes] 13 | const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \ | ^ /tmp/perf-6.6.0-rc1/tools/include/asm-generic/unaligned.h:27:28: note: in expansion of macro '__get_unaligned_t' 27 | return le16_to_cpu(__get_unaligned_t(__le16, p)); | ^~~~~~~~~~~~~~~~~ This comes from the kernel, where the -Wattributes and -Wpacked isn't used, -Wpacked is already disabled, do it for the attributes as well. Fixes: a91c987254651443 ("perf tools: Add get_unaligned_leNN()") Suggested-by: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/7c5b626c-1de9-4c12-a781-e44985b4a797@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/asm-generic/unaligned.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/asm-generic/unaligned.h b/tools/include/asm-generic/unaligned.h index 156743d399aed..2fd551915c202 100644 --- a/tools/include/asm-generic/unaligned.h +++ b/tools/include/asm-generic/unaligned.h @@ -8,6 +8,7 @@ */ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpacked" +#pragma GCC diagnostic ignored "-Wattributes" #define __get_unaligned_t(type, ptr) ({ \ const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \ -- GitLab From dd678532f913c1a742f1a2add6adacfb7ae2b166 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 7 Nov 2023 14:03:31 +0530 Subject: [PATCH 0023/1290] perf header: Additional note on AMD IBS for max_precise pmu cap x86 core PMU exposes supported maximum precision level via max_precise PMU capability. Although, AMD core PMU does not support precise mode, certain core PMU events with precise_ip > 0 are allowed and forwarded to IBS OP PMU. Display a note about this in the 'perf report' header output and document the details in the perf-list man page. Signed-off-by: Ravi Bangoria Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Changbin Du Cc: Huacai Chen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kan Liang Cc: Kim Phillips Cc: Mark Rutland Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ross Zwisler Cc: Sandipan Das Cc: Santosh Shukla Cc: Yang Jihong Link: https://lore.kernel.org/r/20231107083331.901-2-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 12 +++++++----- tools/perf/util/env.c | 18 ++++++++++++++++++ tools/perf/util/env.h | 2 ++ tools/perf/util/header.c | 8 ++++++++ 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index d5f78e125efed..1b90575ee3c84 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -81,11 +81,13 @@ For Intel systems precise event sampling is implemented with PEBS which supports up to precise-level 2, and precise level 3 for some special cases -On AMD systems it is implemented using IBS (up to precise-level 2). -The precise modifier works with event types 0x76 (cpu-cycles, CPU -clocks not halted) and 0xC1 (micro-ops retired). Both events map to -IBS execution sampling (IBS op) with the IBS Op Counter Control bit -(IbsOpCntCtl) set respectively (see the +On AMD systems it is implemented using IBS OP (up to precise-level 2). +Unlike Intel PEBS which provides levels of precision, AMD core pmu is +inherently non-precise and IBS is inherently precise. (i.e. ibs_op//, +ibs_op//p, ibs_op//pp and ibs_op//ppp are all same). The precise modifier +works with event types 0x76 (cpu-cycles, CPU clocks not halted) and 0xC1 +(micro-ops retired). Both events map to IBS execution sampling (IBS op) +with the IBS Op Counter Control bit (IbsOpCntCtl) set respectively (see the Core Complex (CCX) -> Processor x86 Core -> Instruction Based Sampling (IBS) section of the [AMD Processor Programming Reference (PPR)] relevant to the family, model and stepping of the processor being used). diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 44140b7f596a3..cbc18b22ace52 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -531,6 +531,24 @@ int perf_env__numa_node(struct perf_env *env, struct perf_cpu cpu) return cpu.cpu >= 0 && cpu.cpu < env->nr_numa_map ? env->numa_map[cpu.cpu] : -1; } +bool perf_env__has_pmu_mapping(struct perf_env *env, const char *pmu_name) +{ + char *pmu_mapping = env->pmu_mappings, *colon; + + for (int i = 0; i < env->nr_pmu_mappings; ++i) { + if (strtoul(pmu_mapping, &colon, 0) == ULONG_MAX || *colon != ':') + goto out_error; + + pmu_mapping = colon + 1; + if (strcmp(pmu_mapping, pmu_name) == 0) + return true; + + pmu_mapping += strlen(pmu_mapping) + 1; + } +out_error: + return false; +} + char *perf_env__find_pmu_cap(struct perf_env *env, const char *pmu_name, const char *cap) { diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 48d7f8759a2a7..94596ff124d54 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -179,4 +179,6 @@ struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id); int perf_env__numa_node(struct perf_env *env, struct perf_cpu cpu); char *perf_env__find_pmu_cap(struct perf_env *env, const char *pmu_name, const char *cap); + +bool perf_env__has_pmu_mapping(struct perf_env *env, const char *pmu_name); #endif /* __PERF_ENV_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index eeb96a1b63a7c..1c687b5789c0f 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2145,6 +2145,14 @@ static void print_pmu_caps(struct feat_fd *ff, FILE *fp) __print_pmu_caps(fp, pmu_caps->nr_caps, pmu_caps->caps, pmu_caps->pmu_name); } + + if (strcmp(perf_env__arch(&ff->ph->env), "x86") == 0 && + perf_env__has_pmu_mapping(&ff->ph->env, "ibs_op")) { + char *max_precise = perf_env__find_pmu_cap(&ff->ph->env, "cpu", "max_precise"); + + if (max_precise != NULL && atoi(max_precise) == 0) + fprintf(fp, "# AMD systems uses ibs_op// PMU for some precise events, e.g.: cycles:p, see the 'perf list' man page for further details.\n"); + } } static void print_pmu_mappings(struct feat_fd *ff, FILE *fp) -- GitLab From ded8c48497b825f15436048e8ea3a731a3f7dece Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 9 Nov 2023 15:59:20 -0800 Subject: [PATCH 0024/1290] perf annotate: Pass "-l" option to objdump conditionally The "-l" option is to print line numbers in the objdump output. perf annotate TUI only can show the line numbers later but it causes big slow downs for the kernel binary. Similarly, showing source code also takes a long time and it already has an option to control it. $ time objdump ... -d -S -C vmlinux > /dev/null real 0m3.474s user 0m3.047s sys 0m0.428s $ time objdump ... -d -l -C vmlinux > /dev/null real 0m1.796s user 0m1.459s sys 0m0.338s $ time objdump ... -d -C vmlinux > /dev/null real 0m0.051s user 0m0.036s sys 0m0.016s As it's not needed for data type profiling, let's make it conditional so that it can skip the unnecessary work. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu (Google) Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231110000012.3538610-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 9b68b8e3791ca..118195c787b94 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2144,12 +2144,13 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) err = asprintf(&command, "%s %s%s --start-address=0x%016" PRIx64 " --stop-address=0x%016" PRIx64 - " -l -d %s %s %s %c%s%c %s%s -C \"$1\"", + " %s -d %s %s %s %c%s%c %s%s -C \"$1\"", opts->objdump_path ?: "objdump", opts->disassembler_style ? "-M " : "", opts->disassembler_style ?: "", map__rip_2objdump(map, sym->start), map__rip_2objdump(map, sym->end), + opts->show_linenr ? "-l" : "", opts->show_asm_raw ? "" : "--no-show-raw-insn", opts->annotate_src ? "-S" : "", opts->prefix ? "--prefix " : "", -- GitLab From fb7fd2a14a503b9a23fe10303923fc0605c22288 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 9 Nov 2023 15:59:21 -0800 Subject: [PATCH 0025/1290] perf annotate: Move raw_comment and raw_func_start fields out of 'struct ins_operands' Thoese two fields are used only for the jump_ops, so move them into the union to save some bytes. Also add jump__delete() callback not to free the fields as they didn't allocate new strings. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Andi Kleen Cc: Huacai Chen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu (Google) Cc: Peter Zijlstra Cc: Stephane Eranian Cc: WANG Rui Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231110000012.3538610-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/arch/loongarch/annotate/instructions.c | 6 +++--- tools/perf/util/annotate.c | 17 +++++++++++++---- tools/perf/util/annotate.h | 6 ++++-- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/arch/loongarch/annotate/instructions.c index 98e19c5366acf..21cc7e4149f72 100644 --- a/tools/perf/arch/loongarch/annotate/instructions.c +++ b/tools/perf/arch/loongarch/annotate/instructions.c @@ -61,10 +61,10 @@ static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, st const char *c = strchr(ops->raw, '#'); u64 start, end; - ops->raw_comment = strchr(ops->raw, arch->objdump.comment_char); - ops->raw_func_start = strchr(ops->raw, '<'); + ops->jump.raw_comment = strchr(ops->raw, arch->objdump.comment_char); + ops->jump.raw_func_start = strchr(ops->raw, '<'); - if (ops->raw_func_start && c > ops->raw_func_start) + if (ops->jump.raw_func_start && c > ops->jump.raw_func_start) c = NULL; if (c++ != NULL) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 118195c787b94..3364edf30f50e 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -340,10 +340,10 @@ bool ins__is_call(const struct ins *ins) */ static inline const char *validate_comma(const char *c, struct ins_operands *ops) { - if (ops->raw_comment && c > ops->raw_comment) + if (ops->jump.raw_comment && c > ops->jump.raw_comment) return NULL; - if (ops->raw_func_start && c > ops->raw_func_start) + if (ops->jump.raw_func_start && c > ops->jump.raw_func_start) return NULL; return c; @@ -359,8 +359,8 @@ static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_s const char *c = strchr(ops->raw, ','); u64 start, end; - ops->raw_comment = strchr(ops->raw, arch->objdump.comment_char); - ops->raw_func_start = strchr(ops->raw, '<'); + ops->jump.raw_comment = strchr(ops->raw, arch->objdump.comment_char); + ops->jump.raw_func_start = strchr(ops->raw, '<'); c = validate_comma(c, ops); @@ -462,7 +462,16 @@ static int jump__scnprintf(struct ins *ins, char *bf, size_t size, ops->target.offset); } +static void jump__delete(struct ins_operands *ops __maybe_unused) +{ + /* + * The ops->jump.raw_comment and ops->jump.raw_func_start belong to the + * raw string, don't free them. + */ +} + static struct ins_ops jump_ops = { + .free = jump__delete, .parse = jump__parse, .scnprintf = jump__scnprintf, }; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index de59c1aff08e8..bc8b95e8b1be1 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -31,8 +31,6 @@ struct ins { struct ins_operands { char *raw; - char *raw_comment; - char *raw_func_start; struct { char *raw; char *name; @@ -52,6 +50,10 @@ struct ins_operands { struct ins ins; struct ins_operands *ops; } locked; + struct { + char *raw_comment; + char *raw_func_start; + } jump; }; }; -- GitLab From 6f1b6291cf73cb3223f6fb9ec16862a5fe7ed957 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 9 Nov 2023 15:59:22 -0800 Subject: [PATCH 0026/1290] perf tools: Add util/debuginfo.[ch] files Split debuginfo data structure and related functions into a separate file so that it can be used by other components than the probe-finder. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu (Google) Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231110000012.3538610-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/debuginfo.c | 205 +++++++++++++++++++++++++++++++++ tools/perf/util/debuginfo.h | 64 ++++++++++ tools/perf/util/probe-finder.c | 193 +------------------------------ tools/perf/util/probe-finder.h | 19 +-- 5 files changed, 272 insertions(+), 210 deletions(-) create mode 100644 tools/perf/util/debuginfo.c create mode 100644 tools/perf/util/debuginfo.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 96058f949ec9f..73e3f194f949f 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -195,6 +195,7 @@ endif perf-$(CONFIG_DWARF) += probe-finder.o perf-$(CONFIG_DWARF) += dwarf-aux.o perf-$(CONFIG_DWARF) += dwarf-regs.o +perf-$(CONFIG_DWARF) += debuginfo.o perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind-local.o diff --git a/tools/perf/util/debuginfo.c b/tools/perf/util/debuginfo.c new file mode 100644 index 0000000000000..19acf4775d358 --- /dev/null +++ b/tools/perf/util/debuginfo.c @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * DWARF debug information handling code. Copied from probe-finder.c. + * + * Written by Masami Hiramatsu + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "build-id.h" +#include "dso.h" +#include "debug.h" +#include "debuginfo.h" +#include "symbol.h" + +#ifdef HAVE_DEBUGINFOD_SUPPORT +#include +#endif + +/* Dwarf FL wrappers */ +static char *debuginfo_path; /* Currently dummy */ + +static const Dwfl_Callbacks offline_callbacks = { + .find_debuginfo = dwfl_standard_find_debuginfo, + .debuginfo_path = &debuginfo_path, + + .section_address = dwfl_offline_section_address, + + /* We use this table for core files too. */ + .find_elf = dwfl_build_id_find_elf, +}; + +/* Get a Dwarf from offline image */ +static int debuginfo__init_offline_dwarf(struct debuginfo *dbg, + const char *path) +{ + GElf_Addr dummy; + int fd; + + fd = open(path, O_RDONLY); + if (fd < 0) + return fd; + + dbg->dwfl = dwfl_begin(&offline_callbacks); + if (!dbg->dwfl) + goto error; + + dwfl_report_begin(dbg->dwfl); + dbg->mod = dwfl_report_offline(dbg->dwfl, "", "", fd); + if (!dbg->mod) + goto error; + + dbg->dbg = dwfl_module_getdwarf(dbg->mod, &dbg->bias); + if (!dbg->dbg) + goto error; + + dwfl_module_build_id(dbg->mod, &dbg->build_id, &dummy); + + dwfl_report_end(dbg->dwfl, NULL, NULL); + + return 0; +error: + if (dbg->dwfl) + dwfl_end(dbg->dwfl); + else + close(fd); + memset(dbg, 0, sizeof(*dbg)); + + return -ENOENT; +} + +static struct debuginfo *__debuginfo__new(const char *path) +{ + struct debuginfo *dbg = zalloc(sizeof(*dbg)); + if (!dbg) + return NULL; + + if (debuginfo__init_offline_dwarf(dbg, path) < 0) + zfree(&dbg); + if (dbg) + pr_debug("Open Debuginfo file: %s\n", path); + return dbg; +} + +enum dso_binary_type distro_dwarf_types[] = { + DSO_BINARY_TYPE__FEDORA_DEBUGINFO, + DSO_BINARY_TYPE__UBUNTU_DEBUGINFO, + DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO, + DSO_BINARY_TYPE__BUILDID_DEBUGINFO, + DSO_BINARY_TYPE__MIXEDUP_UBUNTU_DEBUGINFO, + DSO_BINARY_TYPE__NOT_FOUND, +}; + +struct debuginfo *debuginfo__new(const char *path) +{ + enum dso_binary_type *type; + char buf[PATH_MAX], nil = '\0'; + struct dso *dso; + struct debuginfo *dinfo = NULL; + struct build_id bid; + + /* Try to open distro debuginfo files */ + dso = dso__new(path); + if (!dso) + goto out; + + /* Set the build id for DSO_BINARY_TYPE__BUILDID_DEBUGINFO */ + if (is_regular_file(path) && filename__read_build_id(path, &bid) > 0) + dso__set_build_id(dso, &bid); + + for (type = distro_dwarf_types; + !dinfo && *type != DSO_BINARY_TYPE__NOT_FOUND; + type++) { + if (dso__read_binary_type_filename(dso, *type, &nil, + buf, PATH_MAX) < 0) + continue; + dinfo = __debuginfo__new(buf); + } + dso__put(dso); + +out: + /* if failed to open all distro debuginfo, open given binary */ + return dinfo ? : __debuginfo__new(path); +} + +void debuginfo__delete(struct debuginfo *dbg) +{ + if (dbg) { + if (dbg->dwfl) + dwfl_end(dbg->dwfl); + free(dbg); + } +} + +/* For the kernel module, we need a special code to get a DIE */ +int debuginfo__get_text_offset(struct debuginfo *dbg, Dwarf_Addr *offs, + bool adjust_offset) +{ + int n, i; + Elf32_Word shndx; + Elf_Scn *scn; + Elf *elf; + GElf_Shdr mem, *shdr; + const char *p; + + elf = dwfl_module_getelf(dbg->mod, &dbg->bias); + if (!elf) + return -EINVAL; + + /* Get the number of relocations */ + n = dwfl_module_relocations(dbg->mod); + if (n < 0) + return -ENOENT; + /* Search the relocation related .text section */ + for (i = 0; i < n; i++) { + p = dwfl_module_relocation_info(dbg->mod, i, &shndx); + if (strcmp(p, ".text") == 0) { + /* OK, get the section header */ + scn = elf_getscn(elf, shndx); + if (!scn) + return -ENOENT; + shdr = gelf_getshdr(scn, &mem); + if (!shdr) + return -ENOENT; + *offs = shdr->sh_addr; + if (adjust_offset) + *offs -= shdr->sh_offset; + } + } + return 0; +} + +#ifdef HAVE_DEBUGINFOD_SUPPORT +int get_source_from_debuginfod(const char *raw_path, + const char *sbuild_id, char **new_path) +{ + debuginfod_client *c = debuginfod_begin(); + const char *p = raw_path; + int fd; + + if (!c) + return -ENOMEM; + + fd = debuginfod_find_source(c, (const unsigned char *)sbuild_id, + 0, p, new_path); + pr_debug("Search %s from debuginfod -> %d\n", p, fd); + if (fd >= 0) + close(fd); + debuginfod_end(c); + if (fd < 0) { + pr_debug("Failed to find %s in debuginfod (%s)\n", + raw_path, sbuild_id); + return -ENOENT; + } + pr_debug("Got a source %s\n", *new_path); + + return 0; +} +#endif /* HAVE_DEBUGINFOD_SUPPORT */ diff --git a/tools/perf/util/debuginfo.h b/tools/perf/util/debuginfo.h new file mode 100644 index 0000000000000..4d65b8c605fc5 --- /dev/null +++ b/tools/perf/util/debuginfo.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PERF_DEBUGINFO_H +#define _PERF_DEBUGINFO_H + +#include +#include + +#ifdef HAVE_DWARF_SUPPORT + +#include "dwarf-aux.h" + +/* debug information structure */ +struct debuginfo { + Dwarf *dbg; + Dwfl_Module *mod; + Dwfl *dwfl; + Dwarf_Addr bias; + const unsigned char *build_id; +}; + +/* This also tries to open distro debuginfo */ +struct debuginfo *debuginfo__new(const char *path); +void debuginfo__delete(struct debuginfo *dbg); + +int debuginfo__get_text_offset(struct debuginfo *dbg, Dwarf_Addr *offs, + bool adjust_offset); + +#else /* HAVE_DWARF_SUPPORT */ + +/* dummy debug information structure */ +struct debuginfo { +}; + +static inline struct debuginfo *debuginfo__new(const char *path __maybe_unused) +{ + return NULL; +} + +static inline void debuginfo__delete(struct debuginfo *dbg __maybe_unused) +{ +} + +static inline int debuginfo__get_text_offset(struct debuginfo *dbg __maybe_unused, + Dwarf_Addr *offs __maybe_unused, + bool adjust_offset __maybe_unused) +{ + return -EINVAL; +} + +#endif /* HAVE_DWARF_SUPPORT */ + +#ifdef HAVE_DEBUGINFOD_SUPPORT +int get_source_from_debuginfod(const char *raw_path, const char *sbuild_id, + char **new_path); +#else /* HAVE_DEBUGINFOD_SUPPORT */ +static inline int get_source_from_debuginfod(const char *raw_path __maybe_unused, + const char *sbuild_id __maybe_unused, + char **new_path __maybe_unused) +{ + return -ENOTSUP; +} +#endif /* HAVE_DEBUGINFOD_SUPPORT */ + +#endif /* _PERF_DEBUGINFO_H */ diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index f171360b0ef4d..8d3dd85f9ff4c 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -23,6 +23,7 @@ #include "event.h" #include "dso.h" #include "debug.h" +#include "debuginfo.h" #include "intlist.h" #include "strbuf.h" #include "strlist.h" @@ -31,128 +32,9 @@ #include "probe-file.h" #include "string2.h" -#ifdef HAVE_DEBUGINFOD_SUPPORT -#include -#endif - /* Kprobe tracer basic type is up to u64 */ #define MAX_BASIC_TYPE_BITS 64 -/* Dwarf FL wrappers */ -static char *debuginfo_path; /* Currently dummy */ - -static const Dwfl_Callbacks offline_callbacks = { - .find_debuginfo = dwfl_standard_find_debuginfo, - .debuginfo_path = &debuginfo_path, - - .section_address = dwfl_offline_section_address, - - /* We use this table for core files too. */ - .find_elf = dwfl_build_id_find_elf, -}; - -/* Get a Dwarf from offline image */ -static int debuginfo__init_offline_dwarf(struct debuginfo *dbg, - const char *path) -{ - GElf_Addr dummy; - int fd; - - fd = open(path, O_RDONLY); - if (fd < 0) - return fd; - - dbg->dwfl = dwfl_begin(&offline_callbacks); - if (!dbg->dwfl) - goto error; - - dwfl_report_begin(dbg->dwfl); - dbg->mod = dwfl_report_offline(dbg->dwfl, "", "", fd); - if (!dbg->mod) - goto error; - - dbg->dbg = dwfl_module_getdwarf(dbg->mod, &dbg->bias); - if (!dbg->dbg) - goto error; - - dwfl_module_build_id(dbg->mod, &dbg->build_id, &dummy); - - dwfl_report_end(dbg->dwfl, NULL, NULL); - - return 0; -error: - if (dbg->dwfl) - dwfl_end(dbg->dwfl); - else - close(fd); - memset(dbg, 0, sizeof(*dbg)); - - return -ENOENT; -} - -static struct debuginfo *__debuginfo__new(const char *path) -{ - struct debuginfo *dbg = zalloc(sizeof(*dbg)); - if (!dbg) - return NULL; - - if (debuginfo__init_offline_dwarf(dbg, path) < 0) - zfree(&dbg); - if (dbg) - pr_debug("Open Debuginfo file: %s\n", path); - return dbg; -} - -enum dso_binary_type distro_dwarf_types[] = { - DSO_BINARY_TYPE__FEDORA_DEBUGINFO, - DSO_BINARY_TYPE__UBUNTU_DEBUGINFO, - DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO, - DSO_BINARY_TYPE__BUILDID_DEBUGINFO, - DSO_BINARY_TYPE__MIXEDUP_UBUNTU_DEBUGINFO, - DSO_BINARY_TYPE__NOT_FOUND, -}; - -struct debuginfo *debuginfo__new(const char *path) -{ - enum dso_binary_type *type; - char buf[PATH_MAX], nil = '\0'; - struct dso *dso; - struct debuginfo *dinfo = NULL; - struct build_id bid; - - /* Try to open distro debuginfo files */ - dso = dso__new(path); - if (!dso) - goto out; - - /* Set the build id for DSO_BINARY_TYPE__BUILDID_DEBUGINFO */ - if (is_regular_file(path) && filename__read_build_id(path, &bid) > 0) - dso__set_build_id(dso, &bid); - - for (type = distro_dwarf_types; - !dinfo && *type != DSO_BINARY_TYPE__NOT_FOUND; - type++) { - if (dso__read_binary_type_filename(dso, *type, &nil, - buf, PATH_MAX) < 0) - continue; - dinfo = __debuginfo__new(buf); - } - dso__put(dso); - -out: - /* if failed to open all distro debuginfo, open given binary */ - return dinfo ? : __debuginfo__new(path); -} - -void debuginfo__delete(struct debuginfo *dbg) -{ - if (dbg) { - if (dbg->dwfl) - dwfl_end(dbg->dwfl); - free(dbg); - } -} - /* * Probe finder related functions */ @@ -1677,44 +1559,6 @@ int debuginfo__find_available_vars_at(struct debuginfo *dbg, return (ret < 0) ? ret : af.nvls; } -/* For the kernel module, we need a special code to get a DIE */ -int debuginfo__get_text_offset(struct debuginfo *dbg, Dwarf_Addr *offs, - bool adjust_offset) -{ - int n, i; - Elf32_Word shndx; - Elf_Scn *scn; - Elf *elf; - GElf_Shdr mem, *shdr; - const char *p; - - elf = dwfl_module_getelf(dbg->mod, &dbg->bias); - if (!elf) - return -EINVAL; - - /* Get the number of relocations */ - n = dwfl_module_relocations(dbg->mod); - if (n < 0) - return -ENOENT; - /* Search the relocation related .text section */ - for (i = 0; i < n; i++) { - p = dwfl_module_relocation_info(dbg->mod, i, &shndx); - if (strcmp(p, ".text") == 0) { - /* OK, get the section header */ - scn = elf_getscn(elf, shndx); - if (!scn) - return -ENOENT; - shdr = gelf_getshdr(scn, &mem); - if (!shdr) - return -ENOENT; - *offs = shdr->sh_addr; - if (adjust_offset) - *offs -= shdr->sh_offset; - } - } - return 0; -} - /* Reverse search */ int debuginfo__find_probe_point(struct debuginfo *dbg, u64 addr, struct perf_probe_point *ppt) @@ -2009,41 +1853,6 @@ found: return (ret < 0) ? ret : lf.found; } -#ifdef HAVE_DEBUGINFOD_SUPPORT -/* debuginfod doesn't require the comp_dir but buildid is required */ -static int get_source_from_debuginfod(const char *raw_path, - const char *sbuild_id, char **new_path) -{ - debuginfod_client *c = debuginfod_begin(); - const char *p = raw_path; - int fd; - - if (!c) - return -ENOMEM; - - fd = debuginfod_find_source(c, (const unsigned char *)sbuild_id, - 0, p, new_path); - pr_debug("Search %s from debuginfod -> %d\n", p, fd); - if (fd >= 0) - close(fd); - debuginfod_end(c); - if (fd < 0) { - pr_debug("Failed to find %s in debuginfod (%s)\n", - raw_path, sbuild_id); - return -ENOENT; - } - pr_debug("Got a source %s\n", *new_path); - - return 0; -} -#else -static inline int get_source_from_debuginfod(const char *raw_path __maybe_unused, - const char *sbuild_id __maybe_unused, - char **new_path __maybe_unused) -{ - return -ENOTSUP; -} -#endif /* * Find a src file from a DWARF tag path. Prepend optional source path prefix * and chop off leading directories that do not exist. Result is passed back as diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h index 8bc1c80d3c1c0..3add5ff516e12 100644 --- a/tools/perf/util/probe-finder.h +++ b/tools/perf/util/probe-finder.h @@ -24,21 +24,7 @@ static inline int is_c_varname(const char *name) #ifdef HAVE_DWARF_SUPPORT #include "dwarf-aux.h" - -/* TODO: export debuginfo data structure even if no dwarf support */ - -/* debug information structure */ -struct debuginfo { - Dwarf *dbg; - Dwfl_Module *mod; - Dwfl *dwfl; - Dwarf_Addr bias; - const unsigned char *build_id; -}; - -/* This also tries to open distro debuginfo */ -struct debuginfo *debuginfo__new(const char *path); -void debuginfo__delete(struct debuginfo *dbg); +#include "debuginfo.h" /* Find probe_trace_events specified by perf_probe_event from debuginfo */ int debuginfo__find_trace_events(struct debuginfo *dbg, @@ -49,9 +35,6 @@ int debuginfo__find_trace_events(struct debuginfo *dbg, int debuginfo__find_probe_point(struct debuginfo *dbg, u64 addr, struct perf_probe_point *ppt); -int debuginfo__get_text_offset(struct debuginfo *dbg, Dwarf_Addr *offs, - bool adjust_offset); - /* Find a line range */ int debuginfo__find_line_range(struct debuginfo *dbg, struct line_range *lr); -- GitLab From a65e8c0b7855e951138eff077af4a6a8721a7ef6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 9 Nov 2023 15:59:23 -0800 Subject: [PATCH 0027/1290] perf dwarf-aux: Fix die_get_typename() for void * The die_get_typename() is to return a C-like type name from DWARF debug entry and it follows data type if the target entry is a pointer type. But I found that void pointers don't have the type attribute to follow and then the function returns an error for that case. This results in a broken type string for void pointer types. For example, the following type entries are pointer types. <1><48c>: Abbrev Number: 4 (DW_TAG_pointer_type) <48d> DW_AT_byte_size : 8 <48d> DW_AT_type : <0x481> <1><491>: Abbrev Number: 211 (DW_TAG_pointer_type) <493> DW_AT_byte_size : 8 <1><494>: Abbrev Number: 4 (DW_TAG_pointer_type) <495> DW_AT_byte_size : 8 <495> DW_AT_type : <0x49e> The first one at offset 48c and the third one at offset 494 have type information. Then they are pointer types for the referenced types. But the second one at offset 491 doesn't have the type attribute. Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu (Google) Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231110000012.3538610-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 2941d88f2199c..4849c3bbfd95d 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1090,7 +1090,14 @@ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf) return strbuf_addf(buf, "%s%s", tmp, name ?: ""); } ret = die_get_typename(&type, buf); - return ret ? ret : strbuf_addstr(buf, tmp); + if (ret < 0) { + /* void pointer has no type attribute */ + if (tag == DW_TAG_pointer_type && ret == -ENOENT) + return strbuf_addf(buf, "void*"); + + return ret; + } + return strbuf_addstr(buf, tmp); } /** -- GitLab From 3796eba7c137e073d1caa95b48d4a320fd2d9600 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 9 Nov 2023 15:59:24 -0800 Subject: [PATCH 0028/1290] perf dwarf-aux: Move #else block of #ifdef HAVE_DWARF_GETLOCATIONS_SUPPORT code to the header file It's a usual convention that the conditional code is handled in a header file. As I'm planning to add some more of them, let's move the current code to the header first. Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu (Google) Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231110000012.3538610-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 7 ------- tools/perf/util/dwarf-aux.h | 19 +++++++++++++++++-- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 4849c3bbfd95d..adef2635587de 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1245,13 +1245,6 @@ int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf) out: return ret; } -#else -int die_get_var_range(Dwarf_Die *sp_die __maybe_unused, - Dwarf_Die *vr_die __maybe_unused, - struct strbuf *buf __maybe_unused) -{ - return -ENOTSUP; -} #endif /* diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index 7ec8bc1083bb3..4f5d0211ee4f2 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -121,7 +121,6 @@ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf); /* Get the name and type of given variable DIE, stored as "type\tname" */ int die_get_varname(Dwarf_Die *vr_die, struct strbuf *buf); -int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf); /* Check if target program is compiled with optimization */ bool die_is_optimized_target(Dwarf_Die *cu_die); @@ -130,4 +129,20 @@ bool die_is_optimized_target(Dwarf_Die *cu_die); void die_skip_prologue(Dwarf_Die *sp_die, Dwarf_Die *cu_die, Dwarf_Addr *entrypc); -#endif +#ifdef HAVE_DWARF_GETLOCATIONS_SUPPORT + +/* Get byte offset range of given variable DIE */ +int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf); + +#else /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ + +static inline int die_get_var_range(Dwarf_Die *sp_die __maybe_unused, + Dwarf_Die *vr_die __maybe_unused, + struct strbuf *buf __maybe_unused) +{ + return -ENOTSUP; +} + +#endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ + +#endif /* _DWARF_AUX_H */ -- GitLab From 981620fd2776c45a514ed621a718e2a6941ad7c5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 9 Nov 2023 15:59:25 -0800 Subject: [PATCH 0029/1290] perf dwarf-aux: Add die_get_scopes() alternative to dwarf_getscopes() The die_get_scopes() returns the number of enclosing DIEs for the given address and it fills an array of DIEs like dwarf_getscopes(). But it doesn't follow the abstract origin of inlined functions as we want information of the concrete instance. This is needed to check the location of parameters and local variables properly. Users can check the origin separately if needed. Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu (Google) Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231110000012.3538610-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 53 +++++++++++++++++++++++++++++++++++++ tools/perf/util/dwarf-aux.h | 3 +++ 2 files changed, 56 insertions(+) diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index adef2635587de..10aa32334d6f2 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1425,3 +1425,56 @@ void die_skip_prologue(Dwarf_Die *sp_die, Dwarf_Die *cu_die, *entrypc = postprologue_addr; } + +/* Internal parameters for __die_find_scope_cb() */ +struct find_scope_data { + /* Target instruction address */ + Dwarf_Addr pc; + /* Number of scopes found [output] */ + int nr; + /* Array of scopes found, 0 for the outermost one. [output] */ + Dwarf_Die *scopes; +}; + +static int __die_find_scope_cb(Dwarf_Die *die_mem, void *arg) +{ + struct find_scope_data *data = arg; + + if (dwarf_haspc(die_mem, data->pc)) { + Dwarf_Die *tmp; + + tmp = realloc(data->scopes, (data->nr + 1) * sizeof(*tmp)); + if (tmp == NULL) + return DIE_FIND_CB_END; + + memcpy(tmp + data->nr, die_mem, sizeof(*die_mem)); + data->scopes = tmp; + data->nr++; + return DIE_FIND_CB_CHILD; + } + return DIE_FIND_CB_SIBLING; +} + +/** + * die_get_scopes - Return a list of scopes including the address + * @cu_die: a compile unit DIE + * @pc: the address to find + * @scopes: the array of DIEs for scopes (result) + * + * This function does the same as the dwarf_getscopes() but doesn't follow + * the origins of inlined functions. It returns the number of scopes saved + * in the @scopes argument. The outer scope will be saved first (index 0) and + * the last one is the innermost scope at the @pc. + */ +int die_get_scopes(Dwarf_Die *cu_die, Dwarf_Addr pc, Dwarf_Die **scopes) +{ + struct find_scope_data data = { + .pc = pc, + }; + Dwarf_Die die_mem; + + die_find_child(cu_die, __die_find_scope_cb, &data, &die_mem); + + *scopes = data.scopes; + return data.nr; +} diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index 4f5d0211ee4f2..f9d765f80fb02 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -129,6 +129,9 @@ bool die_is_optimized_target(Dwarf_Die *cu_die); void die_skip_prologue(Dwarf_Die *sp_die, Dwarf_Die *cu_die, Dwarf_Addr *entrypc); +/* Get the list of including scopes */ +int die_get_scopes(Dwarf_Die *cu_die, Dwarf_Addr pc, Dwarf_Die **scopes); + #ifdef HAVE_DWARF_GETLOCATIONS_SUPPORT /* Get byte offset range of given variable DIE */ -- GitLab From 3f5928e461e394d27bc1ed7d0785b1e63c43d6a1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 9 Nov 2023 15:59:26 -0800 Subject: [PATCH 0030/1290] perf dwarf-aux: Add die_find_variable_by_reg() helper The die_find_variable_by_reg() will search for a variable or a parameter sub-DIE in the given scope DIE where the location matches to the given register. For the simplest and most common case, memory access usually happens with a base register and an offset to the field so the register holds a pointer in a variable or function parameter. Then we can find one if it has a location expression at the (instruction) address. This function only handles such a simple case for now. In this case, the expression has a DW_OP_regN operation where N < 32. If the register index (N) is greater than or equal to 32, DW_OP_regx operation with an operand which saves the value for the N would be used. It rejects expressions with more operations. Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu (Google) Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231110000012.3538610-8-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 67 +++++++++++++++++++++++++++++++++++++ tools/perf/util/dwarf-aux.h | 12 +++++++ 2 files changed, 79 insertions(+) diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 10aa32334d6f2..652e6e7368a26 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1245,6 +1245,73 @@ int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf) out: return ret; } + +/* Interval parameters for __die_find_var_reg_cb() */ +struct find_var_data { + /* Target instruction address */ + Dwarf_Addr pc; + /* Target register */ + unsigned reg; +}; + +/* Max number of registers DW_OP_regN supports */ +#define DWARF_OP_DIRECT_REGS 32 + +/* Only checks direct child DIEs in the given scope. */ +static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg) +{ + struct find_var_data *data = arg; + int tag = dwarf_tag(die_mem); + ptrdiff_t off = 0; + Dwarf_Attribute attr; + Dwarf_Addr base, start, end; + Dwarf_Op *ops; + size_t nops; + + if (tag != DW_TAG_variable && tag != DW_TAG_formal_parameter) + return DIE_FIND_CB_SIBLING; + + if (dwarf_attr(die_mem, DW_AT_location, &attr) == NULL) + return DIE_FIND_CB_SIBLING; + + while ((off = dwarf_getlocations(&attr, off, &base, &start, &end, &ops, &nops)) > 0) { + /* Assuming the location list is sorted by address */ + if (end < data->pc) + continue; + if (start > data->pc) + break; + + /* Only match with a simple case */ + if (data->reg < DWARF_OP_DIRECT_REGS) { + if (ops->atom == (DW_OP_reg0 + data->reg) && nops == 1) + return DIE_FIND_CB_END; + } else { + if (ops->atom == DW_OP_regx && ops->number == data->reg && + nops == 1) + return DIE_FIND_CB_END; + } + } + return DIE_FIND_CB_SIBLING; +} + +/** + * die_find_variable_by_reg - Find a variable saved in a register + * @sc_die: a scope DIE + * @pc: the program address to find + * @reg: the register number to find + * @die_mem: a buffer to save the resulting DIE + * + * Find the variable DIE accessed by the given register. + */ +Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die, Dwarf_Addr pc, int reg, + Dwarf_Die *die_mem) +{ + struct find_var_data data = { + .pc = pc, + .reg = reg, + }; + return die_find_child(sc_die, __die_find_var_reg_cb, &data, die_mem); +} #endif /* diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index f9d765f80fb02..b6f430730bd13 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -137,6 +137,10 @@ int die_get_scopes(Dwarf_Die *cu_die, Dwarf_Addr pc, Dwarf_Die **scopes); /* Get byte offset range of given variable DIE */ int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf); +/* Find a variable saved in the 'reg' at given address */ +Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die, Dwarf_Addr pc, int reg, + Dwarf_Die *die_mem); + #else /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ static inline int die_get_var_range(Dwarf_Die *sp_die __maybe_unused, @@ -146,6 +150,14 @@ static inline int die_get_var_range(Dwarf_Die *sp_die __maybe_unused, return -ENOTSUP; } +static inline Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die __maybe_unused, + Dwarf_Addr pc __maybe_unused, + int reg __maybe_unused, + Dwarf_Die *die_mem __maybe_unused) +{ + return NULL; +} + #endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ #endif /* _DWARF_AUX_H */ -- GitLab From f67f2fda7d997a43e3323aec88d76f1b5e0ea5f2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 9 Nov 2023 15:59:27 -0800 Subject: [PATCH 0031/1290] perf build: Add feature check for dwarf_getcfi() The dwarf_getcfi() is available on libdw 0.142+. Instead of just checking the version number, it'd be nice to have a config item to check the feature at build time. Suggested-by: Masami Hiramatsu (Google) Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu (Google) Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231110000012.3538610-9-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 1 + tools/build/feature/Makefile | 4 ++++ tools/build/feature/test-dwarf_getcfi.c | 9 +++++++++ 3 files changed, 14 insertions(+) create mode 100644 tools/build/feature/test-dwarf_getcfi.c diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 934e2777a2dbc..64df118376df6 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -32,6 +32,7 @@ FEATURE_TESTS_BASIC := \ backtrace \ dwarf \ dwarf_getlocations \ + dwarf_getcfi \ eventfd \ fortify-source \ get_current_dir_name \ diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index dad79ede4e0ae..37722e509eb9f 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -7,6 +7,7 @@ FILES= \ test-bionic.bin \ test-dwarf.bin \ test-dwarf_getlocations.bin \ + test-dwarf_getcfi.bin \ test-eventfd.bin \ test-fortify-source.bin \ test-get_current_dir_name.bin \ @@ -154,6 +155,9 @@ $(OUTPUT)test-dwarf.bin: $(OUTPUT)test-dwarf_getlocations.bin: $(BUILD) $(DWARFLIBS) +$(OUTPUT)test-dwarf_getcfi.bin: + $(BUILD) $(DWARFLIBS) + $(OUTPUT)test-libelf-getphdrnum.bin: $(BUILD) -lelf diff --git a/tools/build/feature/test-dwarf_getcfi.c b/tools/build/feature/test-dwarf_getcfi.c new file mode 100644 index 0000000000000..50e7d7cb7bdf9 --- /dev/null +++ b/tools/build/feature/test-dwarf_getcfi.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +int main(void) +{ + Dwarf *dwarf = NULL; + return dwarf_getcfi(dwarf) == NULL; +} -- GitLab From c06547d02094e7eb81389e9485a45d91cc21914c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 9 Nov 2023 15:59:28 -0800 Subject: [PATCH 0032/1290] perf probe: Convert to check dwarf_getcfi feature Now it has a feature check for the dwarf_getcfi(), use it and convert the code to check HAVE_DWARF_CFI_SUPPORT definition. Suggested-by: Masami Hiramatsu (Google) Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu (Google) Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231110000012.3538610-10-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 5 +++++ tools/perf/util/probe-finder.c | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 8b6cffbc48583..aa55850fbc213 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -476,6 +476,11 @@ else else CFLAGS += -DHAVE_DWARF_GETLOCATIONS_SUPPORT endif # dwarf_getlocations + ifneq ($(feature-dwarf_getcfi), 1) + msg := $(warning Old libdw.h, finding variables at given 'perf probe' point will not work, install elfutils-devel/libdw-dev >= 0.142); + else + CFLAGS += -DHAVE_DWARF_CFI_SUPPORT + endif # dwarf_getcfi endif # Dwarf support endif # libelf support endif # NO_LIBELF diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 8d3dd85f9ff4c..c8923375e30d6 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -604,7 +604,7 @@ static int call_probe_finder(Dwarf_Die *sc_die, struct probe_finder *pf) ret = dwarf_getlocation_addr(&fb_attr, pf->addr, &pf->fb_ops, &nops, 1); if (ret <= 0 || nops == 0) { pf->fb_ops = NULL; -#if _ELFUTILS_PREREQ(0, 142) +#ifdef HAVE_DWARF_CFI_SUPPORT } else if (nops == 1 && pf->fb_ops[0].atom == DW_OP_call_frame_cfa && (pf->cfi_eh != NULL || pf->cfi_dbg != NULL)) { if ((dwarf_cfi_addrframe(pf->cfi_eh, pf->addr, &frame) != 0 && @@ -615,7 +615,7 @@ static int call_probe_finder(Dwarf_Die *sc_die, struct probe_finder *pf) free(frame); return -ENOENT; } -#endif +#endif /* HAVE_DWARF_CFI_SUPPORT */ } /* Call finder's callback handler */ @@ -1140,7 +1140,7 @@ static int debuginfo__find_probes(struct debuginfo *dbg, pf->machine = ehdr.e_machine; -#if _ELFUTILS_PREREQ(0, 142) +#ifdef HAVE_DWARF_CFI_SUPPORT do { GElf_Shdr shdr; @@ -1150,7 +1150,7 @@ static int debuginfo__find_probes(struct debuginfo *dbg, pf->cfi_dbg = dwarf_getcfi(dbg->dbg); } while (0); -#endif +#endif /* HAVE_DWARF_CFI_SUPPORT */ ret = debuginfo__find_probe_location(dbg, pf); return ret; -- GitLab From ccb9e9dd2a996f5814d7072b968b3d2f4b094188 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 11 Nov 2023 15:32:21 +0100 Subject: [PATCH 0033/1290] dt-bindings: input: samsung,s6sy761: convert to DT schema Convert Samsung S6SY761 touchscreen controller bindings to DT schema. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20231111143221.55452-1-krzysztof.kozlowski@linaro.org Signed-off-by: Dmitry Torokhov --- .../input/touchscreen/samsung,s6sy761.txt | 32 ----------- .../input/touchscreen/samsung,s6sy761.yaml | 54 +++++++++++++++++++ 2 files changed, 54 insertions(+), 32 deletions(-) delete mode 100644 Documentation/devicetree/bindings/input/touchscreen/samsung,s6sy761.txt create mode 100644 Documentation/devicetree/bindings/input/touchscreen/samsung,s6sy761.yaml diff --git a/Documentation/devicetree/bindings/input/touchscreen/samsung,s6sy761.txt b/Documentation/devicetree/bindings/input/touchscreen/samsung,s6sy761.txt deleted file mode 100644 index 6805d10d226d3..0000000000000 --- a/Documentation/devicetree/bindings/input/touchscreen/samsung,s6sy761.txt +++ /dev/null @@ -1,32 +0,0 @@ -* Samsung S6SY761 touchscreen controller - -Required properties: -- compatible : must be "samsung,s6sy761" -- reg : I2C slave address, (e.g. 0x48) -- interrupts : interrupt specification -- avdd-supply : analogic power supply -- vdd-supply : power supply - -Optional properties: -- touchscreen-size-x : see touchscreen.txt. This property is embedded in the - device. If defined it forces a different x resolution. -- touchscreen-size-y : see touchscreen.txt. This property is embedded in the - device. If defined it forces a different y resolution. - -Example: - -i2c@00000000 { - - /* ... */ - - touchscreen@48 { - compatible = "samsung,s6sy761"; - reg = <0x48>; - interrupt-parent = <&gpa1>; - interrupts = <1 IRQ_TYPE_NONE>; - avdd-supply = <&ldo30_reg>; - vdd-supply = <&ldo31_reg>; - touchscreen-size-x = <4096>; - touchscreen-size-y = <4096>; - }; -}; diff --git a/Documentation/devicetree/bindings/input/touchscreen/samsung,s6sy761.yaml b/Documentation/devicetree/bindings/input/touchscreen/samsung,s6sy761.yaml new file mode 100644 index 0000000000000..1ffd17af3c53f --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/samsung,s6sy761.yaml @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/touchscreen/samsung,s6sy761.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Samsung S6SY761 touchscreen controller + +maintainers: + - Andi Shyti + +allOf: + - $ref: touchscreen.yaml# + +properties: + compatible: + const: samsung,s6sy761 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + avdd-supply: true + vdd-supply: true + +unevaluatedProperties: false + +required: + - compatible + - reg + - interrupts + - avdd-supply + - vdd-supply + +examples: + - | + #include + i2c { + #address-cells = <1>; + #size-cells = <0>; + + touchscreen@48 { + compatible = "samsung,s6sy761"; + reg = <0x48>; + interrupt-parent = <&gpa1>; + interrupts = <1 IRQ_TYPE_LEVEL_HIGH>; + avdd-supply = <&ldo30_reg>; + vdd-supply = <&ldo31_reg>; + touchscreen-size-x = <4096>; + touchscreen-size-y = <4096>; + }; + }; -- GitLab From b539deafbadb2fc6ba79307a797196454b14f501 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 10 Nov 2023 12:09:08 +0100 Subject: [PATCH 0034/1290] perf report: Add s390 raw data interpretation for PAI counters Commit 39d62336f5c126ad ("s390/pai: add support for cryptography counters") added support for Processor Activity Instrumentation Facility (PAI) counters. These counters values are added as raw data with the perf sample during 'perf record'. Now add support to display these counters in the 'perf report' command. The counter number, its assigned name and value is now printed in addition to the hexadecimal output. Output before: # perf report -D 6 514766399626050 0x7b058 [0x48]: PERF_RECORD_SAMPLE(IP, 0x1): 303977/303977: 0 period: 1 addr: 0 ... thread: paitest:303977 ...... dso: 0x7b0a0@/root/perf.data.paicrypto [0x48]: event: 9 . . ... raw event: size 72 bytes . 0000: 00 00 00 09 00 01 00 48 00 00 00 00 00 00 00 00 .......H........ . 0010: 00 04 a3 69 00 04 a3 69 00 01 d4 2d 76 de a0 bb ...i...i...-v... . 0020: 00 00 00 00 00 01 5c 53 00 00 00 06 00 00 00 00 ......\S........ . 0030: 00 00 00 00 00 00 00 01 00 00 00 0c 00 07 00 00 ................ . 0040: 00 00 00 53 96 af 00 00 ...S.... Output after: # perf report -D 6 514766399626050 0x7b058 [0x48]: PERF_RECORD_SAMPLE(IP, 0x1): 303977/303977: 0 period: 1 addr: 0 ... thread: paitest:303977 ...... dso: 0x7b0a0@/root/perf.data.paicrypto [0x48]: event: 9 . . ... raw event: size 72 bytes . 0000: 00 00 00 09 00 01 00 48 00 00 00 00 00 00 00 00 .......H........ . 0010: 00 04 a3 69 00 04 a3 69 00 01 d4 2d 76 de a0 bb ...i...i...-v... . 0020: 00 00 00 00 00 01 5c 53 00 00 00 06 00 00 00 00 ......\S........ . 0030: 00 00 00 00 00 00 00 01 00 00 00 0c 00 07 00 00 ................ . 0040: 00 00 00 53 96 af 00 00 ...S.... Counter:007 km_aes_128 Value:0x00000000005396af <--- new Committer notes: Had to add ignore pragmas for that __packed function: +#pragma GCC diagnostic ignored "-Wpacked" +#pragma GCC diagnostic ignored "-Wattributes" Otherwise this doesn't build in things like debian experimentao cross building to mips64, etc. Signed-off-by: Thomas Richter Tested-by: Sumanth Korikkar Acked-by: Sumanth Korikkar Cc: Heiko Carstens Cc: Sven Schnelle Cc: Vasily Gorbik Link: https://lore.kernel.org/r/20231110110908.2312308-1-tmricht@linux.ibm.com [ Corrected non-existent commit referred to the right one: 39d62336f5c126ad ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/s390-cpumcf-kernel.h | 2 + tools/perf/util/s390-sample-raw.c | 109 +++++++++++++++++++++++++-- 2 files changed, 103 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/s390-cpumcf-kernel.h b/tools/perf/util/s390-cpumcf-kernel.h index f55ca07f3ca12..74b36644e3849 100644 --- a/tools/perf/util/s390-cpumcf-kernel.h +++ b/tools/perf/util/s390-cpumcf-kernel.h @@ -12,6 +12,8 @@ #define S390_CPUMCF_DIAG_DEF 0xfeef /* Counter diagnostic entry ID */ #define PERF_EVENT_CPUM_CF_DIAG 0xBC000 /* Event: Counter sets */ #define PERF_EVENT_CPUM_SF_DIAG 0xBD000 /* Event: Combined-sampling */ +#define PERF_EVENT_PAI_CRYPTO_ALL 0x1000 /* Event: CRYPTO_ALL */ +#define PERF_EVENT_PAI_NNPA_ALL 0x1800 /* Event: NNPA_ALL */ struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ unsigned int def:16; /* 0-15 Data Entry Format */ diff --git a/tools/perf/util/s390-sample-raw.c b/tools/perf/util/s390-sample-raw.c index 115b16edb4513..29a744eeb71eb 100644 --- a/tools/perf/util/s390-sample-raw.c +++ b/tools/perf/util/s390-sample-raw.c @@ -125,6 +125,9 @@ static int get_counterset_start(int setnr) return 128; case CPUMF_CTR_SET_MT_DIAG: /* Diagnostic counter set */ return 448; + case PERF_EVENT_PAI_NNPA_ALL: /* PAI NNPA counter set */ + case PERF_EVENT_PAI_CRYPTO_ALL: /* PAI CRYPTO counter set */ + return setnr; default: return -1; } @@ -212,27 +215,117 @@ static void s390_cpumcfdg_dump(struct perf_pmu *pmu, struct perf_sample *sample) } } +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpacked" +#pragma GCC diagnostic ignored "-Wattributes" +/* + * Check for consistency of PAI_CRYPTO/PAI_NNPA raw data. + */ +struct pai_data { /* Event number and value */ + u16 event_nr; + u64 event_val; +} __packed; + +#pragma GCC diagnostic pop + +/* + * Test for valid raw data. At least one PAI event should be in the raw + * data section. + */ +static bool s390_pai_all_test(struct perf_sample *sample) +{ + unsigned char *buf = sample->raw_data; + size_t len = sample->raw_size; + + if (len < 0xa || !buf) + return false; + return true; +} + +static void s390_pai_all_dump(struct evsel *evsel, struct perf_sample *sample) +{ + size_t len = sample->raw_size, offset = 0; + unsigned char *p = sample->raw_data; + const char *color = PERF_COLOR_BLUE; + struct pai_data pai_data; + char *ev_name; + + while (offset < len) { + memcpy(&pai_data.event_nr, p, sizeof(pai_data.event_nr)); + pai_data.event_nr = be16_to_cpu(pai_data.event_nr); + p += sizeof(pai_data.event_nr); + offset += sizeof(pai_data.event_nr); + + memcpy(&pai_data.event_val, p, sizeof(pai_data.event_val)); + pai_data.event_val = be64_to_cpu(pai_data.event_val); + p += sizeof(pai_data.event_val); + offset += sizeof(pai_data.event_val); + + ev_name = get_counter_name(evsel->core.attr.config, + pai_data.event_nr, evsel->pmu); + color_fprintf(stdout, color, "\tCounter:%03d %s Value:%#018lx\n", + pai_data.event_nr, ev_name ?: "", + pai_data.event_val); + free(ev_name); + + if (offset + 0xa > len) + break; + } + color_fprintf(stdout, color, "\n"); +} + /* S390 specific trace event function. Check for PERF_RECORD_SAMPLE events - * and if the event was triggered by a counter set diagnostic event display - * its raw data. + * and if the event was triggered by a + * - counter set diagnostic event + * - processor activity assist (PAI) crypto counter event + * - processor activity assist (PAI) neural network processor assist (NNPA) + * counter event + * display its raw data. * The function is only invoked when the dump flag -D is set. + * + * Function evlist__s390_sample_raw() is defined as call back after it has + * been verified that the perf.data file was created on s390 platform. */ -void evlist__s390_sample_raw(struct evlist *evlist, union perf_event *event, struct perf_sample *sample) +void evlist__s390_sample_raw(struct evlist *evlist, union perf_event *event, + struct perf_sample *sample) { + const char *pai_name; struct evsel *evsel; if (event->header.type != PERF_RECORD_SAMPLE) return; evsel = evlist__event2evsel(evlist, event); - if (evsel == NULL || - evsel->core.attr.config != PERF_EVENT_CPUM_CF_DIAG) + if (!evsel) return; /* Display raw data on screen */ - if (!s390_cpumcfdg_testctr(sample)) { - pr_err("Invalid counter set data encountered\n"); + if (evsel->core.attr.config == PERF_EVENT_CPUM_CF_DIAG) { + if (!evsel->pmu) + evsel->pmu = perf_pmus__find("cpum_cf"); + if (!s390_cpumcfdg_testctr(sample)) + pr_err("Invalid counter set data encountered\n"); + else + s390_cpumcfdg_dump(evsel->pmu, sample); return; } - s390_cpumcfdg_dump(evsel->pmu, sample); + + switch (evsel->core.attr.config) { + case PERF_EVENT_PAI_NNPA_ALL: + pai_name = "NNPA_ALL"; + break; + case PERF_EVENT_PAI_CRYPTO_ALL: + pai_name = "CRYPTO_ALL"; + break; + default: + return; + } + + if (!s390_pai_all_test(sample)) { + pr_err("Invalid %s raw data encountered\n", pai_name); + } else { + if (!evsel->pmu) + evsel->pmu = perf_pmus__find_by_type(evsel->core.attr.type); + s390_pai_all_dump(evsel, sample); + } } -- GitLab From acbf6de674ef7b1b5870b25e7b3c695bf84273d0 Mon Sep 17 00:00:00 2001 From: Ji Sheng Teoh Date: Fri, 3 Nov 2023 16:24:41 +0800 Subject: [PATCH 0035/1290] perf vendor events riscv: Add StarFive Dubhe-80 JSON file StarFive's Dubhe-80 supports raw event id 0x00 - 0x22. The raw events are enabled through PMU node of DT binding. Besides raw event, add standard RISC-V firmware events to support monitoring of firmware event. Example of PMU DT node: pmu { compatible = "riscv,pmu"; riscv,raw-event-to-mhpmcounters = /* Event ID 1-31 */ <0x00 0x00 0xFFFFFFFF 0xFFFFFFE0 0x00007FF8>, /* Event ID 32-33 */ <0x00 0x20 0xFFFFFFFF 0xFFFFFFFE 0x00007FF8>, /* Event ID 34 */ <0x00 0x22 0xFFFFFFFF 0xFFFFFF22 0x00007FF8>; }; Example of 'perf stat' output: [root@user]# perf stat -a \ -e access_mmu_stlb \ -e miss_mmu_stlb \ -e access_mmu_pte_c \ -e rob_flush \ -e btb_prediction_miss \ -e itlb_miss \ -e sync_del_fetch_g \ -e icache_miss \ -e bpu_br_retire \ -e bpu_br_miss \ -e ret_ins_retire \ -e ret_ins_miss \ -- openssl speed rsa2048 Doing 2048 bits private rsa's for 10s: 39 2048 bits private RSA's in 10.14s Doing 2048 bits public rsa's for 10s: 1563 2048 bits public RSA's in 10.00s version: 3.0.11 built on: Tue Sep 19 13:02:31 2023 UTC options: bn(64,64) CPUINFO: N/A sign verify sign/s verify/s rsa 2048 bits 0.260000s 0.006398s 3.8 156.3 Performance counter stats for 'system wide': 1338350 access_mmu_stlb 1154025 miss_mmu_stlb 1162691 access_mmu_pte_c 34067 rob_flush 11212384 btb_prediction_miss 1256242 itlb_miss 652523491 sync_del_fetch_g 384465 icache_miss 64635789 bpu_br_retire 323440 bpu_br_miss 8785143 ret_ins_retire 31236 ret_ins_miss 20.760822480 seconds time elapsed Reviewed-by: Ian Rogers Signed-off-by: Ji Sheng Teoh Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Ley Foon Tan Cc: Mark Rutland Cc: Namhyung Kim Cc: Nikita Shubin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: linux-riscv@lists.infradead.org Link: https://lore.kernel.org/r/20231103082441.1389842-1-jisheng.teoh@starfivetech.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/riscv/mapfile.csv | 1 + .../arch/riscv/starfive/dubhe-80/common.json | 172 ++++++++++++++++++ .../riscv/starfive/dubhe-80/firmware.json | 68 +++++++ 3 files changed, 241 insertions(+) create mode 100644 tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/common.json create mode 100644 tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json diff --git a/tools/perf/pmu-events/arch/riscv/mapfile.csv b/tools/perf/pmu-events/arch/riscv/mapfile.csv index c61b3d6ef6166..ee61e26f90cd3 100644 --- a/tools/perf/pmu-events/arch/riscv/mapfile.csv +++ b/tools/perf/pmu-events/arch/riscv/mapfile.csv @@ -15,3 +15,4 @@ # #MVENDORID-MARCHID-MIMPID,Version,Filename,EventType 0x489-0x8000000000000007-0x[[:xdigit:]]+,v1,sifive/u74,core +0x67e-0x80000000db000080-0x[[:xdigit:]]+,v1,starfive/dubhe-80,core diff --git a/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/common.json b/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/common.json new file mode 100644 index 0000000000000..fbffcacb2aceb --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/common.json @@ -0,0 +1,172 @@ +[ + { + "EventName": "ACCESS_MMU_STLB", + "EventCode": "0x1", + "BriefDescription": "access MMU STLB" + }, + { + "EventName": "MISS_MMU_STLB", + "EventCode": "0x2", + "BriefDescription": "miss MMU STLB" + }, + { + "EventName": "ACCESS_MMU_PTE_C", + "EventCode": "0x3", + "BriefDescription": "access MMU PTE-Cache" + }, + { + "EventName": "MISS_MMU_PTE_C", + "EventCode": "0x4", + "BriefDescription": "miss MMU PTE-Cache" + }, + { + "EventName": "ROB_FLUSH", + "EventCode": "0x5", + "BriefDescription": "ROB flush (all kinds of exceptions)" + }, + { + "EventName": "BTB_PREDICTION_MISS", + "EventCode": "0x6", + "BriefDescription": "BTB prediction miss" + }, + { + "EventName": "ITLB_MISS", + "EventCode": "0x7", + "BriefDescription": "ITLB miss" + }, + { + "EventName": "SYNC_DEL_FETCH_G", + "EventCode": "0x8", + "BriefDescription": "SYNC delivery a fetch-group" + }, + { + "EventName": "ICACHE_MISS", + "EventCode": "0x9", + "BriefDescription": "ICache miss" + }, + { + "EventName": "BPU_BR_RETIRE", + "EventCode": "0xA", + "BriefDescription": "condition branch instruction retire" + }, + { + "EventName": "BPU_BR_MISS", + "EventCode": "0xB", + "BriefDescription": "condition branch instruction miss" + }, + { + "EventName": "RET_INS_RETIRE", + "EventCode": "0xC", + "BriefDescription": "return instruction retire" + }, + { + "EventName": "RET_INS_MISS", + "EventCode": "0xD", + "BriefDescription": "return instruction miss" + }, + { + "EventName": "INDIRECT_JR_MISS", + "EventCode": "0xE", + "BriefDescription": "indirect JR instruction miss (inlcude without target)" + }, + { + "EventName": "IBUF_VAL_ID_NORDY", + "EventCode": "0xF", + "BriefDescription": "IBUF valid while ID not ready" + }, + { + "EventName": "IBUF_NOVAL_ID_RDY", + "EventCode": "0x10", + "BriefDescription": "IBUF not valid while ID ready" + }, + { + "EventName": "REN_INT_PHY_REG_NORDY", + "EventCode": "0x11", + "BriefDescription": "REN integer physical register file is not ready" + }, + { + "EventName": "REN_FP_PHY_REG_NORDY", + "EventCode": "0x12", + "BriefDescription": "REN floating point physical register file is not ready" + }, + { + "EventName": "REN_CP_NORDY", + "EventCode": "0x13", + "BriefDescription": "REN checkpoint is not ready" + }, + { + "EventName": "DEC_VAL_ROB_NORDY", + "EventCode": "0x14", + "BriefDescription": "DEC is valid and ROB is not ready" + }, + { + "EventName": "OOD_FLUSH_LS_DEP", + "EventCode": "0x15", + "BriefDescription": "out of order flush due to load/store dependency" + }, + { + "EventName": "BRU_RET_IJR_INS", + "EventCode": "0x16", + "BriefDescription": "BRU retire an IJR instruction" + }, + { + "EventName": "ACCESS_DTLB", + "EventCode": "0x17", + "BriefDescription": "access DTLB" + }, + { + "EventName": "MISS_DTLB", + "EventCode": "0x18", + "BriefDescription": "miss DTLB" + }, + { + "EventName": "LOAD_INS_DCACHE", + "EventCode": "0x19", + "BriefDescription": "load instruction access DCache" + }, + { + "EventName": "LOAD_INS_MISS_DCACHE", + "EventCode": "0x1A", + "BriefDescription": "load instruction miss DCache" + }, + { + "EventName": "STORE_INS_DCACHE", + "EventCode": "0x1B", + "BriefDescription": "store/amo instruction access DCache" + }, + { + "EventName": "STORE_INS_MISS_DCACHE", + "EventCode": "0x1C", + "BriefDescription": "store/amo instruction miss DCache" + }, + { + "EventName": "LOAD_SCACHE", + "EventCode": "0x1D", + "BriefDescription": "load access SCache" + }, + { + "EventName": "STORE_SCACHE", + "EventCode": "0x1E", + "BriefDescription": "store access SCache" + }, + { + "EventName": "LOAD_MISS_SCACHE", + "EventCode": "0x1F", + "BriefDescription": "load miss SCache" + }, + { + "EventName": "STORE_MISS_SCACHE", + "EventCode": "0x20", + "BriefDescription": "store miss SCache" + }, + { + "EventName": "L2C_PF_REQ", + "EventCode": "0x21", + "BriefDescription": "L2C data-prefetcher request" + }, + { + "EventName": "L2C_PF_HIT", + "EventCode": "0x22", + "BriefDescription": "L2C data-prefetcher hit" + } +] diff --git a/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json b/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json new file mode 100644 index 0000000000000..9b4a032186a7b --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json @@ -0,0 +1,68 @@ +[ + { + "ArchStdEvent": "FW_MISALIGNED_LOAD" + }, + { + "ArchStdEvent": "FW_MISALIGNED_STORE" + }, + { + "ArchStdEvent": "FW_ACCESS_LOAD" + }, + { + "ArchStdEvent": "FW_ACCESS_STORE" + }, + { + "ArchStdEvent": "FW_ILLEGAL_INSN" + }, + { + "ArchStdEvent": "FW_SET_TIMER" + }, + { + "ArchStdEvent": "FW_IPI_SENT" + }, + { + "ArchStdEvent": "FW_IPI_RECEIVED" + }, + { + "ArchStdEvent": "FW_FENCE_I_SENT" + }, + { + "ArchStdEvent": "FW_FENCE_I_RECEIVED" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_SENT" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_VMID_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_VMID_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_ASID_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_ASID_RECEIVED" + } +] -- GitLab From b9afaa069e58939d95923c27c2fd76a0523119a7 Mon Sep 17 00:00:00 2001 From: Asmaa Mnebhi Date: Mon, 30 Oct 2023 16:30:58 -0400 Subject: [PATCH 0036/1290] power: reset: pwr-mlxbf: support graceful reboot instead of emergency reset Replace the soft reset with a graceful reboot. An acpi event will be triggered by the irq in the pwr-mlxbf.c to trigger the graceful reboot. Signed-off-by: Asmaa Mnebhi Link: https://lore.kernel.org/r/20231030203058.8056-1-asmaa@nvidia.com Signed-off-by: Sebastian Reichel --- drivers/power/reset/pwr-mlxbf.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/power/reset/pwr-mlxbf.c b/drivers/power/reset/pwr-mlxbf.c index de35d24bb7ef3..1775b318d0ef4 100644 --- a/drivers/power/reset/pwr-mlxbf.c +++ b/drivers/power/reset/pwr-mlxbf.c @@ -17,11 +17,17 @@ #include struct pwr_mlxbf { - struct work_struct send_work; + struct work_struct reboot_work; + struct work_struct shutdown_work; const char *hid; }; -static void pwr_mlxbf_send_work(struct work_struct *work) +static void pwr_mlxbf_reboot_work(struct work_struct *work) +{ + acpi_bus_generate_netlink_event("button/reboot.*", "Reboot Button", 0x80, 1); +} + +static void pwr_mlxbf_shutdown_work(struct work_struct *work) { acpi_bus_generate_netlink_event("button/power.*", "Power Button", 0x80, 1); } @@ -33,10 +39,10 @@ static irqreturn_t pwr_mlxbf_irq(int irq, void *ptr) struct pwr_mlxbf *priv = ptr; if (!strncmp(priv->hid, rst_pwr_hid, 8)) - emergency_restart(); + schedule_work(&priv->reboot_work); if (!strncmp(priv->hid, low_pwr_hid, 8)) - schedule_work(&priv->send_work); + schedule_work(&priv->shutdown_work); return IRQ_HANDLED; } @@ -64,7 +70,11 @@ static int pwr_mlxbf_probe(struct platform_device *pdev) if (irq < 0) return dev_err_probe(dev, irq, "Error getting %s irq.\n", priv->hid); - err = devm_work_autocancel(dev, &priv->send_work, pwr_mlxbf_send_work); + err = devm_work_autocancel(dev, &priv->shutdown_work, pwr_mlxbf_shutdown_work); + if (err) + return err; + + err = devm_work_autocancel(dev, &priv->reboot_work, pwr_mlxbf_reboot_work); if (err) return err; -- GitLab From 160dff476f81b928ee4a4d2be95066fa32513483 Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Tue, 31 Oct 2023 11:27:00 -0700 Subject: [PATCH 0037/1290] dt-bindings: power: reset: $ref reboot-mode in syscon-reboot-mode syscon-reboot-mode.yaml should $ref: reboot-mode.yaml, but instead rewrites the properties. Update so it $refs instead. Signed-off-by: Elliot Berman Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20231031-ref-reboot-mode-v1-1-18dde4faf7e8@quicinc.com Signed-off-by: Sebastian Reichel --- .../bindings/power/reset/syscon-reboot-mode.yaml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/power/reset/syscon-reboot-mode.yaml b/Documentation/devicetree/bindings/power/reset/syscon-reboot-mode.yaml index 9b1ffceefe3de..b6acff199cdec 100644 --- a/Documentation/devicetree/bindings/power/reset/syscon-reboot-mode.yaml +++ b/Documentation/devicetree/bindings/power/reset/syscon-reboot-mode.yaml @@ -29,12 +29,10 @@ properties: $ref: /schemas/types.yaml#/definitions/uint32 description: Offset in the register map for the mode register (in bytes) -patternProperties: - "^mode-.+": - $ref: /schemas/types.yaml#/definitions/uint32 - description: Vendor-specific mode value written to the mode register +allOf: + - $ref: reboot-mode.yaml# -additionalProperties: false +unevaluatedProperties: false required: - compatible -- GitLab From 5739da3e16ad0ebe99c31cabe960856b53eaaabe Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Tue, 31 Oct 2023 11:28:22 -0700 Subject: [PATCH 0038/1290] dt-bindings: power: reset: $ref reboot-mode in nvmem-reboot-mode nvmem-reboot-mode.yaml should $ref: reboot-mode.yaml, but instead rewrites the properties. Update so it $refs instead. Signed-off-by: Elliot Berman Reviewed-by: Rob Herring Reviewed-by: Mukesh Ojha Link: https://lore.kernel.org/r/20231031-ref-nvmem-reboot-mode-v1-1-c1af9070ce52@quicinc.com Signed-off-by: Sebastian Reichel --- .../bindings/power/reset/nvmem-reboot-mode.yaml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/power/reset/nvmem-reboot-mode.yaml b/Documentation/devicetree/bindings/power/reset/nvmem-reboot-mode.yaml index 14a262bcbf7cd..627f8a6078c29 100644 --- a/Documentation/devicetree/bindings/power/reset/nvmem-reboot-mode.yaml +++ b/Documentation/devicetree/bindings/power/reset/nvmem-reboot-mode.yaml @@ -28,17 +28,15 @@ properties: items: - const: reboot-mode -patternProperties: - "^mode-.+": - $ref: /schemas/types.yaml#/definitions/uint32 - description: Vendor-specific mode value written to the mode register +allOf: + - $ref: reboot-mode.yaml# required: - compatible - nvmem-cells - nvmem-cell-names -additionalProperties: false +unevaluatedProperties: false examples: - | -- GitLab From dfcb264a01a9199e8338a548731baf5bbe77ef19 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 4 Nov 2023 16:49:06 +0100 Subject: [PATCH 0039/1290] power: supply: bq27xxx: Stop and start delayed work in suspend and resume This driver uses delayed work to perform periodic battery state read out. This delayed work is not stopped across suspend and resume cycle. The read out can occur early in the resume cycle. In case of an I2C variant of this hardware, that read out triggers I2C transfer. That I2C transfer may happen while the I2C controller is still suspended, which produces a WARNING in the kernel log. Fix this by introducing trivial PM ops, which stop the delayed work before the system enters suspend, and schedule the delayed work right after the system resumes. Signed-off-by: Marek Vasut Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20231104154920.68585-1-marex@denx.de Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq27xxx_battery.c | 22 ++++++++++++++++++++++ drivers/power/supply/bq27xxx_battery_i2c.c | 1 + include/linux/power/bq27xxx_battery.h | 1 + 3 files changed, 24 insertions(+) diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c index 4296600e8912a..1c4a9d1377442 100644 --- a/drivers/power/supply/bq27xxx_battery.c +++ b/drivers/power/supply/bq27xxx_battery.c @@ -2162,6 +2162,28 @@ void bq27xxx_battery_teardown(struct bq27xxx_device_info *di) } EXPORT_SYMBOL_GPL(bq27xxx_battery_teardown); +#ifdef CONFIG_PM_SLEEP +static int bq27xxx_battery_suspend(struct device *dev) +{ + struct bq27xxx_device_info *di = dev_get_drvdata(dev); + + cancel_delayed_work(&di->work); + return 0; +} + +static int bq27xxx_battery_resume(struct device *dev) +{ + struct bq27xxx_device_info *di = dev_get_drvdata(dev); + + schedule_delayed_work(&di->work, 0); + return 0; +} +#endif /* CONFIG_PM_SLEEP */ + +SIMPLE_DEV_PM_OPS(bq27xxx_battery_battery_pm_ops, + bq27xxx_battery_suspend, bq27xxx_battery_resume); +EXPORT_SYMBOL_GPL(bq27xxx_battery_battery_pm_ops); + MODULE_AUTHOR("Rodolfo Giometti "); MODULE_DESCRIPTION("BQ27xxx battery monitor driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c index 9b5475590518f..3a1798b0c1a79 100644 --- a/drivers/power/supply/bq27xxx_battery_i2c.c +++ b/drivers/power/supply/bq27xxx_battery_i2c.c @@ -295,6 +295,7 @@ static struct i2c_driver bq27xxx_battery_i2c_driver = { .driver = { .name = "bq27xxx-battery", .of_match_table = of_match_ptr(bq27xxx_battery_i2c_of_match_table), + .pm = &bq27xxx_battery_battery_pm_ops, }, .probe = bq27xxx_battery_i2c_probe, .remove = bq27xxx_battery_i2c_remove, diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index 7c8d65414a70a..7d8025fb74b70 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -83,5 +83,6 @@ struct bq27xxx_device_info { void bq27xxx_battery_update(struct bq27xxx_device_info *di); int bq27xxx_battery_setup(struct bq27xxx_device_info *di); void bq27xxx_battery_teardown(struct bq27xxx_device_info *di); +extern const struct dev_pm_ops bq27xxx_battery_battery_pm_ops; #endif -- GitLab From 099806de68b75a0fe114376b1ee162fdff572ecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:03 +0100 Subject: [PATCH 0040/1290] power: reset: at91-poweroff: Stop using module_platform_driver_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On today's platforms the benefit of platform_driver_probe() isn't that relevant any more. It allows to drop some code after booting (or module loading) for .probe() and discard the .remove() function completely if the driver is built-in. This typically saves a few 100k. The downside of platform_driver_probe() is that the driver cannot be bound and unbound at runtime which is ancient and so slightly complicates testing. There are also thoughts to deprecate platform_driver_probe() because it adds some complexity in the driver core for little gain. Also many drivers don't use it correctly. This driver for example misses to mark the driver struct with __ref which is needed to suppress a (W=1) modpost warning. Signed-off-by: Uwe Kleine-König Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20231104211501.3676352-17-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/at91-poweroff.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/power/reset/at91-poweroff.c b/drivers/power/reset/at91-poweroff.c index dd5399785b691..83567428ab432 100644 --- a/drivers/power/reset/at91-poweroff.c +++ b/drivers/power/reset/at91-poweroff.c @@ -149,7 +149,7 @@ static void at91_poweroff_dt_set_wakeup_mode(struct platform_device *pdev) writel(wakeup_mode | mode, at91_shdwc.shdwc_base + AT91_SHDW_MR); } -static int __init at91_poweroff_probe(struct platform_device *pdev) +static int at91_poweroff_probe(struct platform_device *pdev) { struct device_node *np; u32 ddr_type; @@ -202,7 +202,7 @@ clk_disable: return ret; } -static int __exit at91_poweroff_remove(struct platform_device *pdev) +static int at91_poweroff_remove(struct platform_device *pdev) { if (pm_power_off == at91_poweroff) pm_power_off = NULL; @@ -224,13 +224,14 @@ static const struct of_device_id at91_poweroff_of_match[] = { MODULE_DEVICE_TABLE(of, at91_poweroff_of_match); static struct platform_driver at91_poweroff_driver = { - .remove = __exit_p(at91_poweroff_remove), + .probe = at91_poweroff_probe, + .remove = at91_poweroff_remove, .driver = { .name = "at91-poweroff", .of_match_table = at91_poweroff_of_match, }, }; -module_platform_driver_probe(at91_poweroff_driver, at91_poweroff_probe); +module_platform_driver(at91_poweroff_driver); MODULE_AUTHOR("Atmel Corporation"); MODULE_DESCRIPTION("Shutdown driver for Atmel SoCs"); -- GitLab From 12389c657b623da34ba9b30306e13919d0b42f3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:04 +0100 Subject: [PATCH 0041/1290] power: reset: at91-reset: Stop using module_platform_driver_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On today's platforms the benefit of platform_driver_probe() isn't that relevant any more. It allows to drop some code after booting (or module loading) for .probe() and discard the .remove() function completely if the driver is built-in. This typically saves a few 100k. The downside of platform_driver_probe() is that the driver cannot be bound and unbound at runtime which is ancient and so slightly complicates testing. There are also thoughts to deprecate platform_driver_probe() because it adds some complexity in the driver core for little gain. Also many drivers don't use it correctly. This driver for example misses to mark the driver struct with __ref which is needed to suppress a (W=1) modpost warning. Signed-off-by: Uwe Kleine-König Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20231104211501.3676352-18-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/at91-reset.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/power/reset/at91-reset.c b/drivers/power/reset/at91-reset.c index aa9b012d3d00b..af85f2f929ba7 100644 --- a/drivers/power/reset/at91-reset.c +++ b/drivers/power/reset/at91-reset.c @@ -337,7 +337,7 @@ static int at91_rcdev_init(struct at91_reset *reset, return devm_reset_controller_register(&pdev->dev, &reset->rcdev); } -static int __init at91_reset_probe(struct platform_device *pdev) +static int at91_reset_probe(struct platform_device *pdev) { const struct of_device_id *match; struct at91_reset *reset; @@ -417,7 +417,7 @@ disable_clk: return ret; } -static int __exit at91_reset_remove(struct platform_device *pdev) +static int at91_reset_remove(struct platform_device *pdev) { struct at91_reset *reset = platform_get_drvdata(pdev); @@ -428,13 +428,14 @@ static int __exit at91_reset_remove(struct platform_device *pdev) } static struct platform_driver at91_reset_driver = { - .remove = __exit_p(at91_reset_remove), + .probe = at91_reset_probe, + .remove = at91_reset_remove, .driver = { .name = "at91-reset", .of_match_table = at91_reset_of_match, }, }; -module_platform_driver_probe(at91_reset_driver, at91_reset_probe); +module_platform_driver(at91_reset_driver); MODULE_AUTHOR("Atmel Corporation"); MODULE_DESCRIPTION("Reset driver for Atmel SoCs"); -- GitLab From dde74a5de817e0a011e4783cf26295d7f6fdca26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:05 +0100 Subject: [PATCH 0042/1290] power: reset: at91-sama5d2_shdwc: Stop using module_platform_driver_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On today's platforms the benefit of platform_driver_probe() isn't that relevant any more. It allows to drop some code after booting (or module loading) for .probe() and discard the .remove() function completely if the driver is built-in. This typically saves a few 100k. The downside of platform_driver_probe() is that the driver cannot be bound and unbound at runtime which is ancient and so slightly complicates testing. There are also thoughts to deprecate platform_driver_probe() because it adds some complexity in the driver core for little gain. Also many drivers don't use it correctly. This driver for example misses to mark the driver struct with __ref which is needed to suppress a (W=1) modpost warning. Signed-off-by: Uwe Kleine-König Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20231104211501.3676352-19-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/at91-sama5d2_shdwc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/power/reset/at91-sama5d2_shdwc.c b/drivers/power/reset/at91-sama5d2_shdwc.c index e76b102b57b1f..ef8add623363a 100644 --- a/drivers/power/reset/at91-sama5d2_shdwc.c +++ b/drivers/power/reset/at91-sama5d2_shdwc.c @@ -329,7 +329,7 @@ static const struct of_device_id at91_pmc_ids[] = { { /* Sentinel. */ } }; -static int __init at91_shdwc_probe(struct platform_device *pdev) +static int at91_shdwc_probe(struct platform_device *pdev) { const struct of_device_id *match; struct device_node *np; @@ -421,7 +421,7 @@ clk_disable: return ret; } -static int __exit at91_shdwc_remove(struct platform_device *pdev) +static int at91_shdwc_remove(struct platform_device *pdev) { struct shdwc *shdw = platform_get_drvdata(pdev); @@ -442,13 +442,14 @@ static int __exit at91_shdwc_remove(struct platform_device *pdev) } static struct platform_driver at91_shdwc_driver = { - .remove = __exit_p(at91_shdwc_remove), + .probe = at91_shdwc_probe, + .remove = at91_shdwc_remove, .driver = { .name = "at91-shdwc", .of_match_table = at91_shdwc_of_match, }, }; -module_platform_driver_probe(at91_shdwc_driver, at91_shdwc_probe); +module_platform_driver(at91_shdwc_driver); MODULE_AUTHOR("Nicolas Ferre "); MODULE_DESCRIPTION("Atmel shutdown controller driver"); -- GitLab From 904e582f0c7282b3d7c76c73c06f3ad3b0910335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:06 +0100 Subject: [PATCH 0043/1290] power: reset: as3722-poweroff: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231104211501.3676352-20-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/as3722-poweroff.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/power/reset/as3722-poweroff.c b/drivers/power/reset/as3722-poweroff.c index 829e0dba2fda3..ab3350ce2d621 100644 --- a/drivers/power/reset/as3722-poweroff.c +++ b/drivers/power/reset/as3722-poweroff.c @@ -61,13 +61,11 @@ static int as3722_poweroff_probe(struct platform_device *pdev) return 0; } -static int as3722_poweroff_remove(struct platform_device *pdev) +static void as3722_poweroff_remove(struct platform_device *pdev) { if (pm_power_off == as3722_pm_power_off) pm_power_off = NULL; as3722_pm_poweroff = NULL; - - return 0; } static struct platform_driver as3722_poweroff_driver = { @@ -75,7 +73,7 @@ static struct platform_driver as3722_poweroff_driver = { .name = "as3722-power-off", }, .probe = as3722_poweroff_probe, - .remove = as3722_poweroff_remove, + .remove_new = as3722_poweroff_remove, }; module_platform_driver(as3722_poweroff_driver); -- GitLab From a31438ece3ec27057c77822b0b0bc0614798c425 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:07 +0100 Subject: [PATCH 0044/1290] power: reset: at91-poweroff: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20231104211501.3676352-21-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/at91-poweroff.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/power/reset/at91-poweroff.c b/drivers/power/reset/at91-poweroff.c index 83567428ab432..126e774e210c0 100644 --- a/drivers/power/reset/at91-poweroff.c +++ b/drivers/power/reset/at91-poweroff.c @@ -202,7 +202,7 @@ clk_disable: return ret; } -static int at91_poweroff_remove(struct platform_device *pdev) +static void at91_poweroff_remove(struct platform_device *pdev) { if (pm_power_off == at91_poweroff) pm_power_off = NULL; @@ -211,8 +211,6 @@ static int at91_poweroff_remove(struct platform_device *pdev) iounmap(at91_shdwc.mpddrc_base); clk_disable_unprepare(at91_shdwc.sclk); - - return 0; } static const struct of_device_id at91_poweroff_of_match[] = { @@ -225,7 +223,7 @@ MODULE_DEVICE_TABLE(of, at91_poweroff_of_match); static struct platform_driver at91_poweroff_driver = { .probe = at91_poweroff_probe, - .remove = at91_poweroff_remove, + .remove_new = at91_poweroff_remove, .driver = { .name = "at91-poweroff", .of_match_table = at91_poweroff_of_match, -- GitLab From 6f539f3151721f1c90fcdafa2962c19a8efc1afc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:08 +0100 Subject: [PATCH 0045/1290] power: reset: atc260x-poweroff: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231104211501.3676352-22-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/atc260x-poweroff.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/power/reset/atc260x-poweroff.c b/drivers/power/reset/atc260x-poweroff.c index 98f20251a6d18..b4aa50e9685e1 100644 --- a/drivers/power/reset/atc260x-poweroff.c +++ b/drivers/power/reset/atc260x-poweroff.c @@ -233,7 +233,7 @@ static int atc260x_pwrc_probe(struct platform_device *pdev) return ret; } -static int atc260x_pwrc_remove(struct platform_device *pdev) +static void atc260x_pwrc_remove(struct platform_device *pdev) { struct atc260x_pwrc *priv = platform_get_drvdata(pdev); @@ -243,13 +243,11 @@ static int atc260x_pwrc_remove(struct platform_device *pdev) } unregister_restart_handler(&priv->restart_nb); - - return 0; } static struct platform_driver atc260x_pwrc_driver = { .probe = atc260x_pwrc_probe, - .remove = atc260x_pwrc_remove, + .remove_new = atc260x_pwrc_remove, .driver = { .name = "atc260x-pwrc", }, -- GitLab From 6642b13206b2225b0d75451f5dd763574a2c8bb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:09 +0100 Subject: [PATCH 0046/1290] power: reset: ltc2952-poweroff: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231104211501.3676352-23-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/ltc2952-poweroff.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/power/reset/ltc2952-poweroff.c b/drivers/power/reset/ltc2952-poweroff.c index eea05921a054b..fa25fbd539343 100644 --- a/drivers/power/reset/ltc2952-poweroff.c +++ b/drivers/power/reset/ltc2952-poweroff.c @@ -286,7 +286,7 @@ static int ltc2952_poweroff_probe(struct platform_device *pdev) return 0; } -static int ltc2952_poweroff_remove(struct platform_device *pdev) +static void ltc2952_poweroff_remove(struct platform_device *pdev) { struct ltc2952_poweroff *data = platform_get_drvdata(pdev); @@ -295,7 +295,6 @@ static int ltc2952_poweroff_remove(struct platform_device *pdev) hrtimer_cancel(&data->timer_wde); atomic_notifier_chain_unregister(&panic_notifier_list, &data->panic_notifier); - return 0; } static const struct of_device_id of_ltc2952_poweroff_match[] = { @@ -306,7 +305,7 @@ MODULE_DEVICE_TABLE(of, of_ltc2952_poweroff_match); static struct platform_driver ltc2952_poweroff_driver = { .probe = ltc2952_poweroff_probe, - .remove = ltc2952_poweroff_remove, + .remove_new = ltc2952_poweroff_remove, .driver = { .name = "ltc2952-poweroff", .of_match_table = of_ltc2952_poweroff_match, -- GitLab From 99f7fa6c7cc573ebe2529959723b71b4d12ec2f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:10 +0100 Subject: [PATCH 0047/1290] power: reset: mt6323-poweroff: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20231104211501.3676352-24-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/mt6323-poweroff.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/power/reset/mt6323-poweroff.c b/drivers/power/reset/mt6323-poweroff.c index 108167f7738bb..57a63c0ab7fb7 100644 --- a/drivers/power/reset/mt6323-poweroff.c +++ b/drivers/power/reset/mt6323-poweroff.c @@ -70,12 +70,10 @@ static int mt6323_pwrc_probe(struct platform_device *pdev) return 0; } -static int mt6323_pwrc_remove(struct platform_device *pdev) +static void mt6323_pwrc_remove(struct platform_device *pdev) { if (pm_power_off == &mt6323_do_pwroff) pm_power_off = NULL; - - return 0; } static const struct of_device_id mt6323_pwrc_dt_match[] = { @@ -86,7 +84,7 @@ MODULE_DEVICE_TABLE(of, mt6323_pwrc_dt_match); static struct platform_driver mt6323_pwrc_driver = { .probe = mt6323_pwrc_probe, - .remove = mt6323_pwrc_remove, + .remove_new = mt6323_pwrc_remove, .driver = { .name = "mt6323-pwrc", .of_match_table = mt6323_pwrc_dt_match, -- GitLab From 1a0457ab2ce81d92c2077a79080141d924884c5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:11 +0100 Subject: [PATCH 0048/1290] power: reset: qnap-poweroff: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231104211501.3676352-25-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/qnap-poweroff.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/power/reset/qnap-poweroff.c b/drivers/power/reset/qnap-poweroff.c index 0ddf7f25f7b87..e0f2ff6b147c1 100644 --- a/drivers/power/reset/qnap-poweroff.c +++ b/drivers/power/reset/qnap-poweroff.c @@ -111,15 +111,14 @@ static int qnap_power_off_probe(struct platform_device *pdev) return 0; } -static int qnap_power_off_remove(struct platform_device *pdev) +static void qnap_power_off_remove(struct platform_device *pdev) { pm_power_off = NULL; - return 0; } static struct platform_driver qnap_power_off_driver = { .probe = qnap_power_off_probe, - .remove = qnap_power_off_remove, + .remove_new = qnap_power_off_remove, .driver = { .name = "qnap_power_off", .of_match_table = of_match_ptr(qnap_power_off_of_match_table), -- GitLab From 6f7be7b2f15a654a5f08b7d37df282c171b9380b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:12 +0100 Subject: [PATCH 0049/1290] power: reset: regulator-poweroff: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231104211501.3676352-26-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/regulator-poweroff.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/power/reset/regulator-poweroff.c b/drivers/power/reset/regulator-poweroff.c index 7f87fbb8b051e..15160809c423a 100644 --- a/drivers/power/reset/regulator-poweroff.c +++ b/drivers/power/reset/regulator-poweroff.c @@ -52,12 +52,10 @@ static int regulator_poweroff_probe(struct platform_device *pdev) return 0; } -static int regulator_poweroff_remove(__maybe_unused struct platform_device *pdev) +static void regulator_poweroff_remove(struct platform_device *pdev) { if (pm_power_off == ®ulator_poweroff_do_poweroff) pm_power_off = NULL; - - return 0; } static const struct of_device_id of_regulator_poweroff_match[] = { @@ -68,7 +66,7 @@ MODULE_DEVICE_TABLE(of, of_regulator_poweroff_match); static struct platform_driver regulator_poweroff_driver = { .probe = regulator_poweroff_probe, - .remove = regulator_poweroff_remove, + .remove_new = regulator_poweroff_remove, .driver = { .name = "poweroff-regulator", .of_match_table = of_regulator_poweroff_match, -- GitLab From aedd4da0aa27fe9d0b03b3fbf62376aebbfc385b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:13 +0100 Subject: [PATCH 0050/1290] power: reset: restart-poweroff: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231104211501.3676352-27-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/restart-poweroff.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/power/reset/restart-poweroff.c b/drivers/power/reset/restart-poweroff.c index 28f1822db1626..f4d6004793d3a 100644 --- a/drivers/power/reset/restart-poweroff.c +++ b/drivers/power/reset/restart-poweroff.c @@ -33,12 +33,10 @@ static int restart_poweroff_probe(struct platform_device *pdev) return 0; } -static int restart_poweroff_remove(struct platform_device *pdev) +static void restart_poweroff_remove(struct platform_device *pdev) { if (pm_power_off == &restart_poweroff_do_poweroff) pm_power_off = NULL; - - return 0; } static const struct of_device_id of_restart_poweroff_match[] = { @@ -49,7 +47,7 @@ MODULE_DEVICE_TABLE(of, of_restart_poweroff_match); static struct platform_driver restart_poweroff_driver = { .probe = restart_poweroff_probe, - .remove = restart_poweroff_remove, + .remove_new = restart_poweroff_remove, .driver = { .name = "poweroff-restart", .of_match_table = of_restart_poweroff_match, -- GitLab From 30d26d2be83de0d036f59f384b805dd3bf4bf5ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:14 +0100 Subject: [PATCH 0051/1290] power: reset: rmobile-reset: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231104211501.3676352-28-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/rmobile-reset.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/power/reset/rmobile-reset.c b/drivers/power/reset/rmobile-reset.c index bd3b396558e0d..5df9b41c68c79 100644 --- a/drivers/power/reset/rmobile-reset.c +++ b/drivers/power/reset/rmobile-reset.c @@ -59,11 +59,10 @@ fail_unmap: return error; } -static int rmobile_reset_remove(struct platform_device *pdev) +static void rmobile_reset_remove(struct platform_device *pdev) { unregister_restart_handler(&rmobile_reset_nb); iounmap(sysc_base2); - return 0; } static const struct of_device_id rmobile_reset_of_match[] = { @@ -74,7 +73,7 @@ MODULE_DEVICE_TABLE(of, rmobile_reset_of_match); static struct platform_driver rmobile_reset_driver = { .probe = rmobile_reset_probe, - .remove = rmobile_reset_remove, + .remove_new = rmobile_reset_remove, .driver = { .name = "rmobile_reset", .of_match_table = rmobile_reset_of_match, -- GitLab From 2973706c4160ed8c1e695b4dfef4bdb3124c5753 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:15 +0100 Subject: [PATCH 0052/1290] power: reset: syscon-poweroff: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231104211501.3676352-29-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/syscon-poweroff.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/power/reset/syscon-poweroff.c b/drivers/power/reset/syscon-poweroff.c index c3aab7f59345a..1b2ce7734260c 100644 --- a/drivers/power/reset/syscon-poweroff.c +++ b/drivers/power/reset/syscon-poweroff.c @@ -76,12 +76,10 @@ static int syscon_poweroff_probe(struct platform_device *pdev) return 0; } -static int syscon_poweroff_remove(struct platform_device *pdev) +static void syscon_poweroff_remove(struct platform_device *pdev) { if (pm_power_off == syscon_poweroff) pm_power_off = NULL; - - return 0; } static const struct of_device_id syscon_poweroff_of_match[] = { @@ -91,7 +89,7 @@ static const struct of_device_id syscon_poweroff_of_match[] = { static struct platform_driver syscon_poweroff_driver = { .probe = syscon_poweroff_probe, - .remove = syscon_poweroff_remove, + .remove_new = syscon_poweroff_remove, .driver = { .name = "syscon-poweroff", .of_match_table = syscon_poweroff_of_match, -- GitLab From 0bf7207e09673144df6425b2cad8bcb015e3501a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 4 Nov 2023 22:15:16 +0100 Subject: [PATCH 0053/1290] power: reset: tps65086-restart: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Returning an error if unregister_restart_handler() failed has no effect but triggering another error message. So converting this driver to .remove_new() has no effect but to suppress the duplicated error message. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231104211501.3676352-30-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/tps65086-restart.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/power/reset/tps65086-restart.c b/drivers/power/reset/tps65086-restart.c index 5ec819eac7da4..ee8e9f4b837ea 100644 --- a/drivers/power/reset/tps65086-restart.c +++ b/drivers/power/reset/tps65086-restart.c @@ -62,19 +62,21 @@ static int tps65086_restart_probe(struct platform_device *pdev) return 0; } -static int tps65086_restart_remove(struct platform_device *pdev) +static void tps65086_restart_remove(struct platform_device *pdev) { struct tps65086_restart *tps65086_restart = platform_get_drvdata(pdev); int ret; ret = unregister_restart_handler(&tps65086_restart->handler); if (ret) { + /* + * tps65086_restart_probe() registered the restart handler. So + * unregistering should work fine. Checking the error code + * shouldn't be needed, still doing it for completeness. + */ dev_err(&pdev->dev, "%s: cannot unregister restart handler: %d\n", __func__, ret); - return -ENODEV; } - - return 0; } static const struct platform_device_id tps65086_restart_id_table[] = { @@ -88,7 +90,7 @@ static struct platform_driver tps65086_restart_driver = { .name = "tps65086-restart", }, .probe = tps65086_restart_probe, - .remove = tps65086_restart_remove, + .remove_new = tps65086_restart_remove, .id_table = tps65086_restart_id_table, }; module_platform_driver(tps65086_restart_driver); -- GitLab From 20cea2b59abe33b80536b936b6729c88d1de2624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 5 Nov 2023 10:47:13 +0100 Subject: [PATCH 0054/1290] power: reset: at91-reset: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20231105094712.3706799-3-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/at91-reset.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/power/reset/at91-reset.c b/drivers/power/reset/at91-reset.c index af85f2f929ba7..16512654295f5 100644 --- a/drivers/power/reset/at91-reset.c +++ b/drivers/power/reset/at91-reset.c @@ -417,19 +417,17 @@ disable_clk: return ret; } -static int at91_reset_remove(struct platform_device *pdev) +static void at91_reset_remove(struct platform_device *pdev) { struct at91_reset *reset = platform_get_drvdata(pdev); unregister_restart_handler(&reset->nb); clk_disable_unprepare(reset->sclk); - - return 0; } static struct platform_driver at91_reset_driver = { .probe = at91_reset_probe, - .remove = at91_reset_remove, + .remove_new = at91_reset_remove, .driver = { .name = "at91-reset", .of_match_table = at91_reset_of_match, -- GitLab From 054eb2377523530404fb64c24c8747feb022c5b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 5 Nov 2023 10:47:14 +0100 Subject: [PATCH 0055/1290] power: reset: at91-sama5d2_shdwc: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20231105094712.3706799-4-u.kleine-koenig@pengutronix.de Signed-off-by: Sebastian Reichel --- drivers/power/reset/at91-sama5d2_shdwc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/power/reset/at91-sama5d2_shdwc.c b/drivers/power/reset/at91-sama5d2_shdwc.c index ef8add623363a..af95c7b39cb3c 100644 --- a/drivers/power/reset/at91-sama5d2_shdwc.c +++ b/drivers/power/reset/at91-sama5d2_shdwc.c @@ -421,7 +421,7 @@ clk_disable: return ret; } -static int at91_shdwc_remove(struct platform_device *pdev) +static void at91_shdwc_remove(struct platform_device *pdev) { struct shdwc *shdw = platform_get_drvdata(pdev); @@ -437,13 +437,11 @@ static int at91_shdwc_remove(struct platform_device *pdev) iounmap(shdw->pmc_base); clk_disable_unprepare(shdw->sclk); - - return 0; } static struct platform_driver at91_shdwc_driver = { .probe = at91_shdwc_probe, - .remove = at91_shdwc_remove, + .remove_new = at91_shdwc_remove, .driver = { .name = "at91-shdwc", .of_match_table = at91_shdwc_of_match, -- GitLab From f37669119423ca852ca855b24732f25c0737aa57 Mon Sep 17 00:00:00 2001 From: Jan Palus Date: Sat, 11 Nov 2023 23:17:04 +0100 Subject: [PATCH 0056/1290] power: supply: cw2015: correct time_to_empty units in sysfs RRT_ALRT register holds remaining battery time in minutes therefore it needs to be scaled accordingly when exposing TIME_TO_EMPTY via sysfs expressed in seconds Fixes: b4c7715c10c1 ("power: supply: add CellWise cw2015 fuel gauge driver") Signed-off-by: Jan Palus Link: https://lore.kernel.org/r/20231111221704.5579-1-jpalus@fastmail.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/cw2015_battery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/power/supply/cw2015_battery.c b/drivers/power/supply/cw2015_battery.c index bb29e9ebd24a8..99f3ccdc30a6a 100644 --- a/drivers/power/supply/cw2015_battery.c +++ b/drivers/power/supply/cw2015_battery.c @@ -491,7 +491,7 @@ static int cw_battery_get_property(struct power_supply *psy, case POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW: if (cw_battery_valid_time_to_empty(cw_bat)) - val->intval = cw_bat->time_to_empty; + val->intval = cw_bat->time_to_empty * 60; else val->intval = 0; break; -- GitLab From 8c91ef9883bfb5a6315917a7104dd1e06c85b278 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 30 Oct 2023 10:48:17 +0100 Subject: [PATCH 0057/1290] dt-bindings: phy: qcom,sc8280xp-qmp-ufs-phy: document the SM8650 QMP UFS PHY Document the QMP UFS PHY on the SM8650 Platform. Signed-off-by: Neil Armstrong Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231030-topic-sm8650-upstream-phy-v2-1-a543a4c4b491@linaro.org Signed-off-by: Vinod Koul --- .../devicetree/bindings/phy/qcom,sc8280xp-qmp-ufs-phy.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-ufs-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-ufs-phy.yaml index f3a3296c811cb..8474eef8d0ff5 100644 --- a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-ufs-phy.yaml +++ b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-ufs-phy.yaml @@ -32,6 +32,7 @@ properties: - qcom,sm8350-qmp-ufs-phy - qcom,sm8450-qmp-ufs-phy - qcom,sm8550-qmp-ufs-phy + - qcom,sm8650-qmp-ufs-phy reg: maxItems: 1 @@ -112,6 +113,7 @@ allOf: - qcom,sm8250-qmp-ufs-phy - qcom,sm8350-qmp-ufs-phy - qcom,sm8550-qmp-ufs-phy + - qcom,sm8650-qmp-ufs-phy then: properties: clocks: -- GitLab From 9e3f381986f6be1a7199587f8921cbb7e89a733e Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 30 Oct 2023 10:48:18 +0100 Subject: [PATCH 0058/1290] dt-bindings: phy: qcom,sc8280xp-qmp-pcie-phy: document the SM8650 QMP PCIe PHYs Document the QMP PCIe PHYs on the SM8650 Platform. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20231030-topic-sm8650-upstream-phy-v2-2-a543a4c4b491@linaro.org Signed-off-by: Vinod Koul --- .../devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml index 2c3d6553a7bac..6c03f2d5fca3c 100644 --- a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml +++ b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml @@ -36,6 +36,8 @@ properties: - qcom,sm8450-qmp-gen4x2-pcie-phy - qcom,sm8550-qmp-gen3x2-pcie-phy - qcom,sm8550-qmp-gen4x2-pcie-phy + - qcom,sm8650-qmp-gen3x2-pcie-phy + - qcom,sm8650-qmp-gen4x2-pcie-phy reg: minItems: 1 @@ -147,6 +149,8 @@ allOf: - qcom,sm8450-qmp-gen3x2-pcie-phy - qcom,sm8550-qmp-gen3x2-pcie-phy - qcom,sm8550-qmp-gen4x2-pcie-phy + - qcom,sm8650-qmp-gen3x2-pcie-phy + - qcom,sm8650-qmp-gen4x2-pcie-phy then: properties: clocks: @@ -189,6 +193,7 @@ allOf: contains: enum: - qcom,sm8550-qmp-gen4x2-pcie-phy + - qcom,sm8650-qmp-gen4x2-pcie-phy then: properties: resets: -- GitLab From 685c00ac4240c1205005f617916f68b76da78908 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 30 Oct 2023 10:48:19 +0100 Subject: [PATCH 0059/1290] dt-bindings: phy: qcom,sc8280xp-qmp-usb43dp-phy: document the SM8650 QMP USB/DP Combo PHY Document the QMP USB/DP Combo PHY on the SM8650 Platform. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20231030-topic-sm8650-upstream-phy-v2-3-a543a4c4b491@linaro.org Signed-off-by: Vinod Koul --- .../devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml index 9af203dc8793f..ae83cb8cb21f8 100644 --- a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml +++ b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml @@ -27,6 +27,7 @@ properties: - qcom,sm8350-qmp-usb3-dp-phy - qcom,sm8450-qmp-usb3-dp-phy - qcom,sm8550-qmp-usb3-dp-phy + - qcom,sm8650-qmp-usb3-dp-phy reg: maxItems: 1 @@ -128,6 +129,7 @@ allOf: - qcom,sc8280xp-qmp-usb43dp-phy - qcom,sm6350-qmp-usb3-dp-phy - qcom,sm8550-qmp-usb3-dp-phy + - qcom,sm8650-qmp-usb3-dp-phy then: required: - power-domains -- GitLab From 330df15dab25931c9214bfa279319755d2201d7f Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 30 Oct 2023 10:48:20 +0100 Subject: [PATCH 0060/1290] dt-bindings: phy: qcom,snps-eusb2: document the SM8650 Synopsys eUSB2 PHY Document the Synopsys eUSB2 PHY on the SM8650 Platform by using the SM8550 as fallback. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20231030-topic-sm8650-upstream-phy-v2-4-a543a4c4b491@linaro.org Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/phy/qcom,snps-eusb2-phy.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/phy/qcom,snps-eusb2-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,snps-eusb2-phy.yaml index c95828607ab6b..8f5d7362046c9 100644 --- a/Documentation/devicetree/bindings/phy/qcom,snps-eusb2-phy.yaml +++ b/Documentation/devicetree/bindings/phy/qcom,snps-eusb2-phy.yaml @@ -18,6 +18,7 @@ properties: - items: - enum: - qcom,sdx75-snps-eusb2-phy + - qcom,sm8650-snps-eusb2-phy - const: qcom,sm8550-snps-eusb2-phy - const: qcom,sm8550-snps-eusb2-phy -- GitLab From 7c4bf8cb9d40a7222bb5d641979952b196707985 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 30 Oct 2023 10:48:21 +0100 Subject: [PATCH 0061/1290] phy: qcom: qmp-ufs: add QMP UFS PHY tables for SM8650 Add QMP UFS PHY support for the SM8650 platform. Reviewed-by: Dmitry Baryshkov Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20231030-topic-sm8650-upstream-phy-v2-5-a543a4c4b491@linaro.org Signed-off-by: Vinod Koul --- .../phy/qualcomm/phy-qcom-qmp-pcs-ufs-v6.h | 1 + .../phy-qcom-qmp-qserdes-txrx-ufs-v6.h | 7 ++ drivers/phy/qualcomm/phy-qcom-qmp-ufs.c | 86 +++++++++++++++++++ 3 files changed, 94 insertions(+) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-pcs-ufs-v6.h b/drivers/phy/qualcomm/phy-qcom-qmp-pcs-ufs-v6.h index c23d5e41e25b5..fe6c450f61238 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-pcs-ufs-v6.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp-pcs-ufs-v6.h @@ -12,6 +12,7 @@ #define QPHY_V6_PCS_UFS_SW_RESET 0x008 #define QPHY_V6_PCS_UFS_TIMER_20US_CORECLK_STEPS_MSB 0x00c #define QPHY_V6_PCS_UFS_TIMER_20US_CORECLK_STEPS_LSB 0x010 +#define QPHY_V6_PCS_UFS_PCS_CTRL1 0x020 #define QPHY_V6_PCS_UFS_PLL_CNTL 0x02c #define QPHY_V6_PCS_UFS_TX_LARGE_AMP_DRV_LVL 0x030 #define QPHY_V6_PCS_UFS_TX_SMALL_AMP_DRV_LVL 0x038 diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-ufs-v6.h b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-ufs-v6.h index 15bcb4ba91399..ae220fd04d10d 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-ufs-v6.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-ufs-v6.h @@ -10,10 +10,17 @@ #define QSERDES_UFS_V6_TX_RES_CODE_LANE_RX 0x2c #define QSERDES_UFS_V6_TX_RES_CODE_LANE_OFFSET_TX 0x30 #define QSERDES_UFS_V6_TX_RES_CODE_LANE_OFFSET_RX 0x34 +#define QSERDES_UFS_V6_TX_LANE_MODE_1 0x7c #define QSERDES_UFS_V6_RX_UCDR_FASTLOCK_FO_GAIN_RATE2 0x08 #define QSERDES_UFS_V6_RX_UCDR_FASTLOCK_FO_GAIN_RATE4 0x10 +#define QSERDES_UFS_V6_RX_UCDR_SO_SATURATION 0x28 +#define QSERDES_UFS_V6_RX_UCDR_PI_CTRL1 0x58 +#define QSERDES_UFS_V6_RX_RX_TERM_BW_CTRL0 0xc4 +#define QSERDES_UFS_V6_RX_UCDR_FO_GAIN_RATE2 0xd4 +#define QSERDES_UFS_V6_RX_UCDR_FO_GAIN_RATE4 0xdc #define QSERDES_UFS_V6_RX_VGA_CAL_MAN_VAL 0x178 +#define QSERDES_UFS_V6_RX_INTERFACE_MODE 0x1e0 #define QSERDES_UFS_V6_RX_MODE_RATE_0_1_B0 0x208 #define QSERDES_UFS_V6_RX_MODE_RATE_0_1_B1 0x20c #define QSERDES_UFS_V6_RX_MODE_RATE_0_1_B3 0x214 diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c index 514fa14df6345..5f79d188b435e 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c @@ -803,6 +803,67 @@ static const struct qmp_phy_init_tbl sm8550_ufsphy_pcs[] = { QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_MULTI_LANE_CTRL1, 0x02), }; +static const struct qmp_phy_init_tbl sm8650_ufsphy_serdes[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYSCLK_EN_SEL, 0xd9), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CMN_CONFIG_1, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x11), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_HS_SWITCH_SEL_1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_EN, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_IVCO, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_MAP, 0x44), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_INITVAL2, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x41), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CP_CTRL_MODE0, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_RCTRL_MODE0, 0x18), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE0, 0x14), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x7f), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x06), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE1, 0x4c), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CP_CTRL_MODE1, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_RCTRL_MODE1, 0x18), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE1, 0x14), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE1, 0x99), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE1, 0x07), +}; + +static const struct qmp_phy_init_tbl sm8650_ufsphy_tx[] = { + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_TX_LANE_MODE_1, 0x05), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_TX_RES_CODE_LANE_OFFSET_TX, 0x07), +}; + +static const struct qmp_phy_init_tbl sm8650_ufsphy_rx[] = { + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_UCDR_FO_GAIN_RATE2, 0x0c), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_UCDR_FO_GAIN_RATE4, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_VGA_CAL_MAN_VAL, 0x0e), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE_0_1_B0, 0xc2), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE_0_1_B1, 0xc2), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE_0_1_B3, 0x1a), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE_0_1_B6, 0x60), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE2_B3, 0x9e), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE2_B6, 0x60), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE3_B3, 0x9e), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE3_B4, 0x0e), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE3_B5, 0x36), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE3_B8, 0x02), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE4_B3, 0xb9), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE4_B6, 0xff), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_UCDR_SO_SATURATION, 0x1f), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_UCDR_PI_CTRL1, 0x94), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_RX_TERM_BW_CTRL0, 0xfa), +}; + +static const struct qmp_phy_init_tbl sm8650_ufsphy_pcs[] = { + QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_MULTI_LANE_CTRL1, 0x00), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_MID_TERM_CTRL1, 0x43), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_PCS_CTRL1, 0xc1), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_PLL_CNTL, 0x33), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_HSGEAR_CAPABILITY, 0x04), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_RX_HSGEAR_CAPABILITY, 0x04), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_LARGE_AMP_DRV_LVL, 0x0f), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_RX_SIGDET_CTRL2, 0x69), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_MULTI_LANE_CTRL1, 0x02), +}; + struct qmp_ufs_offsets { u16 serdes; u16 pcs; @@ -1303,6 +1364,28 @@ static const struct qmp_phy_cfg sm8550_ufsphy_cfg = { .regs = ufsphy_v6_regs_layout, }; +static const struct qmp_phy_cfg sm8650_ufsphy_cfg = { + .lanes = 2, + + .offsets = &qmp_ufs_offsets_v6, + + .tbls = { + .serdes = sm8650_ufsphy_serdes, + .serdes_num = ARRAY_SIZE(sm8650_ufsphy_serdes), + .tx = sm8650_ufsphy_tx, + .tx_num = ARRAY_SIZE(sm8650_ufsphy_tx), + .rx = sm8650_ufsphy_rx, + .rx_num = ARRAY_SIZE(sm8650_ufsphy_rx), + .pcs = sm8650_ufsphy_pcs, + .pcs_num = ARRAY_SIZE(sm8650_ufsphy_pcs), + }, + .clk_list = sdm845_ufs_phy_clk_l, + .num_clks = ARRAY_SIZE(sdm845_ufs_phy_clk_l), + .vreg_list = qmp_phy_vreg_l, + .num_vregs = ARRAY_SIZE(qmp_phy_vreg_l), + .regs = ufsphy_v6_regs_layout, +}; + static void qmp_ufs_configure_lane(void __iomem *base, const struct qmp_phy_init_tbl tbl[], int num, @@ -1826,6 +1909,9 @@ static const struct of_device_id qmp_ufs_of_match_table[] = { }, { .compatible = "qcom,sm8550-qmp-ufs-phy", .data = &sm8550_ufsphy_cfg, + }, { + .compatible = "qcom,sm8650-qmp-ufs-phy", + .data = &sm8650_ufsphy_cfg, }, { }, }; -- GitLab From c954b6d347e78690f053342b2d2759c650e61a43 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 30 Oct 2023 10:48:22 +0100 Subject: [PATCH 0062/1290] phy: qcom: qmp-pcie: add QMP PCIe PHY tables for SM8650 Add QMP PCIe PHY support for the SM8650 platform. Reviewed-by: Dmitry Baryshkov Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20231030-topic-sm8650-upstream-phy-v2-6-a543a4c4b491@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-pcie.c | 65 ++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c b/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c index b64598ac59f4d..2af7115ef9689 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c @@ -1909,6 +1909,35 @@ static const struct qmp_phy_init_tbl sm8550_qmp_gen4x2_pcie_pcs_misc_tbl[] = { QMP_PHY_INIT_CFG(QPHY_PCIE_V6_20_PCS_G4_FOM_EQ_CONFIG5, 0xf2), }; +static const struct qmp_phy_init_tbl sm8650_qmp_gen4x2_pcie_rx_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_UCDR_FO_GAIN_RATE_2, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_UCDR_FO_GAIN_RATE_3, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_UCDR_PI_CONTROLS, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_UCDR_SO_ACC_DEFAULT_VAL_RATE3, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_IVCM_CAL_CTRL2, 0x82), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_DFE_3, 0x05), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_VGA_CAL_MAN_VAL, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_GM_CAL, 0x0d), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_EQU_ADAPTOR_CNTRL4, 0x0b), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_SIGDET_ENABLES, 0x1c), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_PHPRE_CTRL, 0x20), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_DFE_CTLE_POST_CAL_OFFSET, 0x38), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE2_B0, 0xd3), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE2_B1, 0xd3), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE2_B2, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE2_B3, 0x9a), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE2_B4, 0x06), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE2_B5, 0xb6), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE2_B6, 0xee), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE3_B0, 0x23), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE3_B1, 0x9b), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE3_B2, 0x60), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE3_B3, 0xdf), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE3_B4, 0x43), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE3_B5, 0x76), + QMP_PHY_INIT_CFG(QSERDES_V6_20_RX_MODE_RATE3_B6, 0xff), +}; + static const struct qmp_phy_init_tbl sa8775p_qmp_gen4x2_pcie_serdes_alt_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V5_COM_BIAS_EN_CLKBUFLR_EN, 0x14), QMP_PHY_INIT_CFG(QSERDES_V5_COM_PLL_IVCO, 0x0f), @@ -3047,6 +3076,36 @@ static const struct qmp_phy_cfg sm8550_qmp_gen4x2_pciephy_cfg = { .has_nocsr_reset = true, }; +static const struct qmp_phy_cfg sm8650_qmp_gen4x2_pciephy_cfg = { + .lanes = 2, + + .offsets = &qmp_pcie_offsets_v6_20, + + .tbls = { + .serdes = sm8550_qmp_gen4x2_pcie_serdes_tbl, + .serdes_num = ARRAY_SIZE(sm8550_qmp_gen4x2_pcie_serdes_tbl), + .tx = sm8550_qmp_gen4x2_pcie_tx_tbl, + .tx_num = ARRAY_SIZE(sm8550_qmp_gen4x2_pcie_tx_tbl), + .rx = sm8650_qmp_gen4x2_pcie_rx_tbl, + .rx_num = ARRAY_SIZE(sm8650_qmp_gen4x2_pcie_rx_tbl), + .pcs = sm8550_qmp_gen4x2_pcie_pcs_tbl, + .pcs_num = ARRAY_SIZE(sm8550_qmp_gen4x2_pcie_pcs_tbl), + .pcs_misc = sm8550_qmp_gen4x2_pcie_pcs_misc_tbl, + .pcs_misc_num = ARRAY_SIZE(sm8550_qmp_gen4x2_pcie_pcs_misc_tbl), + .ln_shrd = sm8550_qmp_gen4x2_pcie_ln_shrd_tbl, + .ln_shrd_num = ARRAY_SIZE(sm8550_qmp_gen4x2_pcie_ln_shrd_tbl), + }, + .reset_list = sdm845_pciephy_reset_l, + .num_resets = ARRAY_SIZE(sdm845_pciephy_reset_l), + .vreg_list = sm8550_qmp_phy_vreg_l, + .num_vregs = ARRAY_SIZE(sm8550_qmp_phy_vreg_l), + .regs = pciephy_v5_regs_layout, + + .pwrdn_ctrl = SW_PWRDN | REFCLK_DRV_DSBL, + .phy_status = PHYSTATUS_4_20, + .has_nocsr_reset = true, +}; + static const struct qmp_phy_cfg sa8775p_qmp_gen4x2_pciephy_cfg = { .lanes = 2, .offsets = &qmp_pcie_offsets_v5_20, @@ -3820,6 +3879,12 @@ static const struct of_device_id qmp_pcie_of_match_table[] = { }, { .compatible = "qcom,sm8550-qmp-gen4x2-pcie-phy", .data = &sm8550_qmp_gen4x2_pciephy_cfg, + }, { + .compatible = "qcom,sm8650-qmp-gen3x2-pcie-phy", + .data = &sm8550_qmp_gen3x2_pciephy_cfg, + }, { + .compatible = "qcom,sm8650-qmp-gen4x2-pcie-phy", + .data = &sm8650_qmp_gen4x2_pciephy_cfg, }, { }, }; -- GitLab From 80c1afe8c5fec68d9467c543551a0b2fddd463ff Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 30 Oct 2023 10:48:23 +0100 Subject: [PATCH 0063/1290] phy: qcom: qmp-combo: add QMP USB3/DP PHY tables for SM8650 Add QMP USB3/DP Combo PHY support for the SM8650 platform. Reviewed-by: Dmitry Baryshkov Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20231030-topic-sm8650-upstream-phy-v2-7-a543a4c4b491@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c index 9c87845c78ec2..0417856b8e7bc 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c @@ -3558,6 +3558,10 @@ static const struct of_device_id qmp_combo_of_match_table[] = { .compatible = "qcom,sm8550-qmp-usb3-dp-phy", .data = &sm8550_usb3dpphy_cfg, }, + { + .compatible = "qcom,sm8650-qmp-usb3-dp-phy", + .data = &sm8550_usb3dpphy_cfg, + }, { } }; MODULE_DEVICE_TABLE(of, qmp_combo_of_match_table); -- GitLab From 772dd70a5ed6845d87738f82c788c9ff4e37fd7f Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 1 Nov 2023 16:57:57 +0100 Subject: [PATCH 0064/1290] phy: core: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/d2323636c6cd2ec22f73a0ae6c2d34ac99b4abd5.1698854255.git.christophe.jaillet@wanadoo.fr Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index 96a0b1e111f34..d9be6a4d53838 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -959,7 +959,7 @@ struct phy *phy_create(struct device *dev, struct device_node *node, if (!phy) return ERR_PTR(-ENOMEM); - id = ida_simple_get(&phy_ida, 0, 0, GFP_KERNEL); + id = ida_alloc(&phy_ida, GFP_KERNEL); if (id < 0) { dev_err(dev, "unable to get id\n"); ret = id; @@ -1232,7 +1232,7 @@ static void phy_release(struct device *dev) dev_vdbg(dev, "releasing '%s'\n", dev_name(dev)); debugfs_remove_recursive(phy->debugfs); regulator_put(phy->pwr); - ida_simple_remove(&phy_ida, phy->id); + ida_free(&phy_ida, phy->id); kfree(phy); } -- GitLab From b55d073e6501dc6077edaa945a6dad8ac5c8bbab Mon Sep 17 00:00:00 2001 From: Su Hui Date: Thu, 16 Nov 2023 12:18:23 +0800 Subject: [PATCH 0065/1290] power: supply: bq256xx: fix some problem in bq256xx_hw_init smatch complains that there is a buffer overflow and clang complains 'ret' is never read. Smatch error: drivers/power/supply/bq256xx_charger.c:1578 bq256xx_hw_init() error: buffer overflow 'bq256xx_watchdog_time' 4 <= 4 Clang static checker: Value stored to 'ret' is never read. Add check for buffer overflow and error code from regmap_update_bits(). Fixes: 32e4978bb920 ("power: supply: bq256xx: Introduce the BQ256XX charger driver") Signed-off-by: Su Hui Link: https://lore.kernel.org/r/20231116041822.1378758-1-suhui@nfschina.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq256xx_charger.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/bq256xx_charger.c b/drivers/power/supply/bq256xx_charger.c index 789a31bd70c39..1a935bc885108 100644 --- a/drivers/power/supply/bq256xx_charger.c +++ b/drivers/power/supply/bq256xx_charger.c @@ -1574,13 +1574,16 @@ static int bq256xx_hw_init(struct bq256xx_device *bq) wd_reg_val = i; break; } - if (bq->watchdog_timer > bq256xx_watchdog_time[i] && + if (i + 1 < BQ256XX_NUM_WD_VAL && + bq->watchdog_timer > bq256xx_watchdog_time[i] && bq->watchdog_timer < bq256xx_watchdog_time[i + 1]) wd_reg_val = i; } ret = regmap_update_bits(bq->regmap, BQ256XX_CHARGER_CONTROL_1, BQ256XX_WATCHDOG_MASK, wd_reg_val << BQ256XX_WDT_BIT_SHIFT); + if (ret) + return ret; ret = power_supply_get_battery_info(bq->charger, &bat_info); if (ret == -ENOMEM) -- GitLab From fedfa36d045ab78ea9a0aa2c5a3d5d74c27207d3 Mon Sep 17 00:00:00 2001 From: Jiri Valek - 2N Date: Fri, 17 Nov 2023 02:41:52 +0000 Subject: [PATCH 0066/1290] Input: cap11xx - remove unnecessary IRQ parsing Separate IRQ parsing is not necessary, I2C core do the job. Signed-off-by: Jiri Valek - 2N Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/cap11xx.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/input/keyboard/cap11xx.c b/drivers/input/keyboard/cap11xx.c index 1b4937dce6725..01e7ead68fead 100644 --- a/drivers/input/keyboard/cap11xx.c +++ b/drivers/input/keyboard/cap11xx.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -334,7 +334,7 @@ static int cap11xx_i2c_probe(struct i2c_client *i2c_client) struct cap11xx_priv *priv; struct device_node *node; const struct cap11xx_hw_model *cap; - int i, error, irq, gain = 0; + int i, error, gain = 0; unsigned int val, rev; u32 gain32; @@ -474,13 +474,8 @@ static int cap11xx_i2c_probe(struct i2c_client *i2c_client) if (error) return error; - irq = irq_of_parse_and_map(node, 0); - if (!irq) { - dev_err(dev, "Unable to parse or map IRQ\n"); - return -ENXIO; - } - - error = devm_request_threaded_irq(dev, irq, NULL, cap11xx_thread_func, + error = devm_request_threaded_irq(dev, i2c_client->irq, + NULL, cap11xx_thread_func, IRQF_ONESHOT, dev_name(dev), priv); if (error) return error; -- GitLab From 3d762e21d56370a43478b55e604b4a83dd85aafc Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 6 Nov 2023 10:23:10 -0600 Subject: [PATCH 0067/1290] rtc: cmos: Use ACPI alarm for non-Intel x86 systems too Intel systems > 2015 have been configured to use ACPI alarm instead of HPET to avoid s2idle issues. Having HPET programmed for wakeup causes problems on AMD systems with s2idle as well. One particular case is that the systemd "SuspendThenHibernate" feature doesn't work properly on the Framework 13" AMD model. Switching to using ACPI alarm fixes the issue. Adjust the quirk to apply to AMD/Hygon systems from 2021 onwards. This matches what has been tested and is specifically to avoid potential risk to older systems. Cc: # 6.1+ Reported-by: Reported-by: Closes: https://github.com/systemd/systemd/issues/24279 Reported-by: Kelvie Wong Closes: https://community.frame.work/t/systemd-suspend-then-hibernate-wakes-up-after-5-minutes/39392 Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20231106162310.85711-1-mario.limonciello@amd.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-cmos.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 228fb2d11c709..696cfa7025ded 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -818,18 +818,24 @@ static void rtc_wake_off(struct device *dev) } #ifdef CONFIG_X86 -/* Enable use_acpi_alarm mode for Intel platforms no earlier than 2015 */ static void use_acpi_alarm_quirks(void) { - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (dmi_get_bios_year() < 2015) + return; + break; + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: + if (dmi_get_bios_year() < 2021) + return; + break; + default: return; - + } if (!is_hpet_enabled()) return; - if (dmi_get_bios_year() < 2015) - return; - use_acpi_alarm = true; } #else -- GitLab From e44a4dc4b36cc087878596b937d52caca35e9b19 Mon Sep 17 00:00:00 2001 From: Dimitri John Ledkov Date: Sun, 22 Oct 2023 20:40:26 +0100 Subject: [PATCH 0068/1290] apparmor: switch SECURITY_APPARMOR_HASH from sha1 to sha256 sha1 is insecure and has colisions, thus it is not useful for even lightweight policy hash checks. Switch to sha256, which on modern hardware is fast enough. Separately as per NIST Policy on Hash Functions, sha1 usage must be withdrawn by 2030. This config option currently is one of many that holds up sha1 usage. Signed-off-by: Dimitri John Ledkov Signed-off-by: John Johansen --- security/apparmor/Kconfig | 12 ++++++------ security/apparmor/apparmorfs.c | 16 ++++++++-------- security/apparmor/crypto.c | 6 +++--- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/security/apparmor/Kconfig b/security/apparmor/Kconfig index e0d1dd0a192a9..64cc3044a42ce 100644 --- a/security/apparmor/Kconfig +++ b/security/apparmor/Kconfig @@ -57,10 +57,10 @@ config SECURITY_APPARMOR_INTROSPECT_POLICY cpu is paramount. config SECURITY_APPARMOR_HASH - bool "Enable introspection of sha1 hashes for loaded profiles" + bool "Enable introspection of sha256 hashes for loaded profiles" depends on SECURITY_APPARMOR_INTROSPECT_POLICY select CRYPTO - select CRYPTO_SHA1 + select CRYPTO_SHA256 default y help This option selects whether introspection of loaded policy @@ -74,10 +74,10 @@ config SECURITY_APPARMOR_HASH_DEFAULT depends on SECURITY_APPARMOR_HASH default y help - This option selects whether sha1 hashing of loaded policy - is enabled by default. The generation of sha1 hashes for - loaded policy provide system administrators a quick way - to verify that policy in the kernel matches what is expected, + This option selects whether sha256 hashing of loaded policy + is enabled by default. The generation of sha256 hashes for + loaded policy provide system administrators a quick way to + verify that policy in the kernel matches what is expected, however it can slow down policy load on some devices. In these cases policy hashing can be disabled by default and enabled only if needed. diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 38650e52ef578..21a6413b84726 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1474,7 +1474,7 @@ int __aa_fs_create_rawdata(struct aa_ns *ns, struct aa_loaddata *rawdata) rawdata->dents[AAFS_LOADDATA_REVISION] = dent; if (aa_g_hash_policy) { - dent = aafs_create_file("sha1", S_IFREG | 0444, dir, + dent = aafs_create_file("sha256", S_IFREG | 0444, dir, rawdata, &seq_rawdata_hash_fops); if (IS_ERR(dent)) goto fail; @@ -1648,11 +1648,11 @@ static const char *rawdata_get_link_base(struct dentry *dentry, return target; } -static const char *rawdata_get_link_sha1(struct dentry *dentry, +static const char *rawdata_get_link_sha256(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - return rawdata_get_link_base(dentry, inode, done, "sha1"); + return rawdata_get_link_base(dentry, inode, done, "sha256"); } static const char *rawdata_get_link_abi(struct dentry *dentry, @@ -1669,8 +1669,8 @@ static const char *rawdata_get_link_data(struct dentry *dentry, return rawdata_get_link_base(dentry, inode, done, "raw_data"); } -static const struct inode_operations rawdata_link_sha1_iops = { - .get_link = rawdata_get_link_sha1, +static const struct inode_operations rawdata_link_sha256_iops = { + .get_link = rawdata_get_link_sha256, }; static const struct inode_operations rawdata_link_abi_iops = { @@ -1743,7 +1743,7 @@ int __aafs_profile_mkdir(struct aa_profile *profile, struct dentry *parent) profile->dents[AAFS_PROF_ATTACH] = dent; if (profile->hash) { - dent = create_profile_file(dir, "sha1", profile, + dent = create_profile_file(dir, "sha256", profile, &seq_profile_hash_fops); if (IS_ERR(dent)) goto fail; @@ -1753,9 +1753,9 @@ int __aafs_profile_mkdir(struct aa_profile *profile, struct dentry *parent) #ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY if (profile->rawdata) { if (aa_g_hash_policy) { - dent = aafs_create("raw_sha1", S_IFLNK | 0444, dir, + dent = aafs_create("raw_sha256", S_IFLNK | 0444, dir, profile->label.proxy, NULL, NULL, - &rawdata_link_sha1_iops); + &rawdata_link_sha256_iops); if (IS_ERR(dent)) goto fail; aa_get_proxy(profile->label.proxy); diff --git a/security/apparmor/crypto.c b/security/apparmor/crypto.c index 6724e2ff6da89..aad486b2fca65 100644 --- a/security/apparmor/crypto.c +++ b/security/apparmor/crypto.c @@ -106,16 +106,16 @@ static int __init init_profile_hash(void) if (!apparmor_initialized) return 0; - tfm = crypto_alloc_shash("sha1", 0, 0); + tfm = crypto_alloc_shash("sha256", 0, 0); if (IS_ERR(tfm)) { int error = PTR_ERR(tfm); - AA_ERROR("failed to setup profile sha1 hashing: %d\n", error); + AA_ERROR("failed to setup profile sha256 hashing: %d\n", error); return error; } apparmor_tfm = tfm; apparmor_hash_size = crypto_shash_digestsize(apparmor_tfm); - aa_info_message("AppArmor sha1 policy hashing enabled"); + aa_info_message("AppArmor sha256 policy hashing enabled"); return 0; } -- GitLab From 3c49ce0e220912406a478ac128657672608200a0 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Thu, 9 Nov 2023 06:32:28 -0800 Subject: [PATCH 0069/1290] apparmor: declare stack_msg as static stack_msg in upstream code is only used in securit/apparmor/domain.c so declare it as static. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311092251.TwKSNZ0u-lkp@intel.com/ Signed-off-by: John Johansen --- security/apparmor/domain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 89fbeab4b33bd..571158ec6188f 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -1311,7 +1311,7 @@ static int change_profile_perms_wrapper(const char *op, const char *name, return error; } -const char *stack_msg = "change_profile unprivileged unconfined converted to stacking"; +static const char *stack_msg = "change_profile unprivileged unconfined converted to stacking"; /** * aa_change_profile - perform a one-way profile transition -- GitLab From 735ad5d1532a811d068a731dff46fa02c2185981 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sun, 19 Nov 2023 01:14:10 -0800 Subject: [PATCH 0070/1290] apparmor: declare nulldfa as static With the conversion to a refcounted pdb the nulldfa is now only used in security/apparmor/lsm.c so declar it as static. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311092038.lqfYnvmf-lkp@intel.com/ Signed-off-by: John Johansen --- security/apparmor/lsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 4981bdf029931..d3d2fc13c6e7c 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -2106,7 +2106,7 @@ __initcall(apparmor_nf_ip_init); static char nulldfa_src[] = { #include "nulldfa.in" }; -struct aa_dfa *nulldfa; +static struct aa_dfa *nulldfa; static char stacksplitdfa_src[] = { #include "stacksplitdfa.in" -- GitLab From a7e405a2de69fe5e6657046e978a81683b140051 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sun, 19 Nov 2023 01:19:41 -0800 Subject: [PATCH 0071/1290] apparmor: add missing params to aa_may_ptrace kernel-doc comments When the cred was explicit passed through to aa_may_ptrace() the kernel-doc comment was not properly updated. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311040508.AUhi04RY-lkp@intel.com/ Signed-off-by: John Johansen --- security/apparmor/task.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/security/apparmor/task.c b/security/apparmor/task.c index f29a2e80e6bf6..c87fb9f4ac18a 100644 --- a/security/apparmor/task.c +++ b/security/apparmor/task.c @@ -278,7 +278,9 @@ static int profile_tracer_perm(const struct cred *cred, /** * aa_may_ptrace - test if tracer task can trace the tracee + * @tracer_cred: cred of task doing the tracing (NOT NULL) * @tracer: label of the task doing the tracing (NOT NULL) + * @tracee_cred: cred of task to be traced * @tracee: task label to be traced * @request: permission request * -- GitLab From 5c68b66d4d7eff8cdb6f508f8537faa30c5faa6d Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 20 Nov 2023 20:07:39 +0100 Subject: [PATCH 0072/1290] soundwire: qcom: drop unneeded qcom_swrm_stream_alloc_ports() cleanup The cleanup in "err" goto label clears bits from pconfig array which is a local variable. This does not have any effect outside of this function, so drop this useless code. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20231120190740.339350-1-krzysztof.kozlowski@linaro.org Signed-off-by: Vinod Koul --- drivers/soundwire/qcom.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/soundwire/qcom.c b/drivers/soundwire/qcom.c index a1e2d6c981862..754870a4a047c 100644 --- a/drivers/soundwire/qcom.c +++ b/drivers/soundwire/qcom.c @@ -1157,7 +1157,7 @@ static int qcom_swrm_stream_alloc_ports(struct qcom_swrm_ctrl *ctrl, struct sdw_port_runtime *p_rt; struct sdw_slave *slave; unsigned long *port_mask; - int i, maxport, pn, nports = 0, ret = 0; + int maxport, pn, nports = 0, ret = 0; unsigned int m_port; mutex_lock(&ctrl->port_lock); @@ -1183,7 +1183,7 @@ static int qcom_swrm_stream_alloc_ports(struct qcom_swrm_ctrl *ctrl, if (pn > maxport) { dev_err(ctrl->dev, "All ports busy\n"); ret = -EBUSY; - goto err; + goto out; } set_bit(pn, port_mask); pconfig[nports].num = pn; @@ -1205,12 +1205,7 @@ static int qcom_swrm_stream_alloc_ports(struct qcom_swrm_ctrl *ctrl, sconfig.bps = 1; sdw_stream_add_master(&ctrl->bus, &sconfig, pconfig, nports, stream); -err: - if (ret) { - for (i = 0; i < nports; i++) - clear_bit(pconfig[i].num, port_mask); - } - +out: mutex_unlock(&ctrl->port_lock); return ret; -- GitLab From 5bdc61ef45007908df9d8587111c7a5a552bdd46 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 20 Nov 2023 20:07:40 +0100 Subject: [PATCH 0073/1290] soundwire: qcom: move sconfig in qcom_swrm_stream_alloc_ports() out of critical section Setting members of local variable "sconfig" in qcom_swrm_stream_alloc_ports() does not depend on any earlier code in this function, so can be moved up before the critical section. This makes the code a bit easier to follow because critical section is smaller. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20231120190740.339350-2-krzysztof.kozlowski@linaro.org Signed-off-by: Vinod Koul --- drivers/soundwire/qcom.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/soundwire/qcom.c b/drivers/soundwire/qcom.c index 754870a4a047c..e9a52c1bd3594 100644 --- a/drivers/soundwire/qcom.c +++ b/drivers/soundwire/qcom.c @@ -1160,6 +1160,17 @@ static int qcom_swrm_stream_alloc_ports(struct qcom_swrm_ctrl *ctrl, int maxport, pn, nports = 0, ret = 0; unsigned int m_port; + if (direction == SNDRV_PCM_STREAM_CAPTURE) + sconfig.direction = SDW_DATA_DIR_TX; + else + sconfig.direction = SDW_DATA_DIR_RX; + + /* hw parameters wil be ignored as we only support PDM */ + sconfig.ch_count = 1; + sconfig.frame_rate = params_rate(params); + sconfig.type = stream->type; + sconfig.bps = 1; + mutex_lock(&ctrl->port_lock); list_for_each_entry(m_rt, &stream->master_list, stream_node) { if (m_rt->direction == SDW_DATA_DIR_RX) { @@ -1193,16 +1204,6 @@ static int qcom_swrm_stream_alloc_ports(struct qcom_swrm_ctrl *ctrl, } } - if (direction == SNDRV_PCM_STREAM_CAPTURE) - sconfig.direction = SDW_DATA_DIR_TX; - else - sconfig.direction = SDW_DATA_DIR_RX; - - /* hw parameters wil be ignored as we only support PDM */ - sconfig.ch_count = 1; - sconfig.frame_rate = params_rate(params); - sconfig.type = stream->type; - sconfig.bps = 1; sdw_stream_add_master(&ctrl->bus, &sconfig, pconfig, nports, stream); out: -- GitLab From 21f4c443731fdb064c0dd31a743aafd0b075156c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 20 Nov 2023 18:47:20 +0100 Subject: [PATCH 0074/1290] soundwire: stream: constify sdw_port_config when adding devices sdw_stream_add_master() and sdw_stream_add_slave() do not modify contents of passed sdw_port_config, so it can be made const for code safety and as documentation of expected usage. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20231120174720.239610-1-krzysztof.kozlowski@linaro.org Signed-off-by: Vinod Koul --- drivers/soundwire/stream.c | 10 +++++----- include/linux/soundwire/sdw.h | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c index 69719b335bcb1..9dc6399f206af 100644 --- a/drivers/soundwire/stream.c +++ b/drivers/soundwire/stream.c @@ -897,7 +897,7 @@ static struct sdw_port_runtime *sdw_port_alloc(struct list_head *port_list) } static int sdw_port_config(struct sdw_port_runtime *p_rt, - struct sdw_port_config *port_config, + const struct sdw_port_config *port_config, int port_index) { p_rt->ch_mask = port_config[port_index].ch_mask; @@ -970,7 +970,7 @@ static int sdw_slave_port_is_valid_range(struct device *dev, int num) static int sdw_slave_port_config(struct sdw_slave *slave, struct sdw_slave_runtime *s_rt, - struct sdw_port_config *port_config) + const struct sdw_port_config *port_config) { struct sdw_port_runtime *p_rt; int ret; @@ -1026,7 +1026,7 @@ static int sdw_master_port_alloc(struct sdw_master_runtime *m_rt, } static int sdw_master_port_config(struct sdw_master_runtime *m_rt, - struct sdw_port_config *port_config) + const struct sdw_port_config *port_config) { struct sdw_port_runtime *p_rt; int ret; @@ -1861,7 +1861,7 @@ EXPORT_SYMBOL(sdw_release_stream); */ int sdw_stream_add_master(struct sdw_bus *bus, struct sdw_stream_config *stream_config, - struct sdw_port_config *port_config, + const struct sdw_port_config *port_config, unsigned int num_ports, struct sdw_stream_runtime *stream) { @@ -1981,7 +1981,7 @@ EXPORT_SYMBOL(sdw_stream_remove_master); */ int sdw_stream_add_slave(struct sdw_slave *slave, struct sdw_stream_config *stream_config, - struct sdw_port_config *port_config, + const struct sdw_port_config *port_config, unsigned int num_ports, struct sdw_stream_runtime *stream) { diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 4f3d14bb15385..904004d8b5622 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -1040,7 +1040,7 @@ int sdw_compute_params(struct sdw_bus *bus); int sdw_stream_add_master(struct sdw_bus *bus, struct sdw_stream_config *stream_config, - struct sdw_port_config *port_config, + const struct sdw_port_config *port_config, unsigned int num_ports, struct sdw_stream_runtime *stream); int sdw_stream_remove_master(struct sdw_bus *bus, @@ -1062,7 +1062,7 @@ void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id int sdw_stream_add_slave(struct sdw_slave *slave, struct sdw_stream_config *stream_config, - struct sdw_port_config *port_config, + const struct sdw_port_config *port_config, unsigned int num_ports, struct sdw_stream_runtime *stream); int sdw_stream_remove_slave(struct sdw_slave *slave, @@ -1084,7 +1084,7 @@ int sdw_update_no_pm(struct sdw_slave *slave, u32 addr, u8 mask, u8 val); static inline int sdw_stream_add_slave(struct sdw_slave *slave, struct sdw_stream_config *stream_config, - struct sdw_port_config *port_config, + const struct sdw_port_config *port_config, unsigned int num_ports, struct sdw_stream_runtime *stream) { -- GitLab From 280b4e4a9e8009affd8f20ec2d467cb4deb05c1c Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Tue, 12 Sep 2023 16:07:59 +1000 Subject: [PATCH 0075/1290] perf tools: Address python 3.6 DeprecationWarning for string scapes Python 3.6 introduced a DeprecationWarning for invalid escape sequences. This is upgraded to a SyntaxWarning in Python 3.12, and will eventually be a syntax error. Fix these now to get ahead of it before it's an error. Signed-off-by: Benjamin Gray Acked-by: Adrian Hunter Cc: Alexander Shishkin Cc: Andrii Nakryiko Cc: Hartley Sweeten Cc: Ian Abbott Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kiszka Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kieran Bingham Cc: Mark Rutland Cc: Mykola Lysenko Cc: Namhyung Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Shuah Khan Cc: Todd E Brandt Cc: Tom Rix Cc: linux-doc@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-kselftest@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: llvm@lists.linux.dev Link: https://lore.kernel.org/r/20230912060801.95533-6-bgray@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.py | 2 +- tools/perf/scripts/python/arm-cs-trace-disasm.py | 4 ++-- tools/perf/scripts/python/compaction-times.py | 2 +- tools/perf/scripts/python/exported-sql-viewer.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index 3c091ab753059..0093c998cb6e6 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -83,7 +83,7 @@ def c_len(s: str) -> int: """Return the length of s a C string This doesn't handle all escape characters properly. It first assumes - all \ are for escaping, it then adjusts as it will have over counted + all \\ are for escaping, it then adjusts as it will have over counted \\. The code uses \000 rather than \0 as a terminator as an adjacent number would be folded into a string of \0 (ie. "\0" + "5" doesn't equal a terminator followed by the number 5 but the escape of diff --git a/tools/perf/scripts/python/arm-cs-trace-disasm.py b/tools/perf/scripts/python/arm-cs-trace-disasm.py index d59ff53f1d946..de58991c78bb4 100755 --- a/tools/perf/scripts/python/arm-cs-trace-disasm.py +++ b/tools/perf/scripts/python/arm-cs-trace-disasm.py @@ -45,8 +45,8 @@ parser = OptionParser(option_list=option_list) # Initialize global dicts and regular expression disasm_cache = dict() cpu_data = dict() -disasm_re = re.compile("^\s*([0-9a-fA-F]+):") -disasm_func_re = re.compile("^\s*([0-9a-fA-F]+)\s.*:") +disasm_re = re.compile(r"^\s*([0-9a-fA-F]+):") +disasm_func_re = re.compile(r"^\s*([0-9a-fA-F]+)\s.*:") cache_size = 64*1024 glb_source_file_name = None diff --git a/tools/perf/scripts/python/compaction-times.py b/tools/perf/scripts/python/compaction-times.py index 2560a042dc6fa..9401f7c147477 100644 --- a/tools/perf/scripts/python/compaction-times.py +++ b/tools/perf/scripts/python/compaction-times.py @@ -260,7 +260,7 @@ def pr_help(): comm_re = None pid_re = None -pid_regex = "^(\d*)-(\d*)$|^(\d*)$" +pid_regex = r"^(\d*)-(\d*)$|^(\d*)$" opt_proc = popt.DISP_DFL opt_disp = topt.DISP_ALL diff --git a/tools/perf/scripts/python/exported-sql-viewer.py b/tools/perf/scripts/python/exported-sql-viewer.py index 13f2d8a816109..121cf61ba1b34 100755 --- a/tools/perf/scripts/python/exported-sql-viewer.py +++ b/tools/perf/scripts/python/exported-sql-viewer.py @@ -677,8 +677,8 @@ class CallGraphModelBase(TreeModel): # sqlite supports GLOB (text only) which uses * and ? and is case sensitive if not self.glb.dbref.is_sqlite3: # Escape % and _ - s = value.replace("%", "\%") - s = s.replace("_", "\_") + s = value.replace("%", "\\%") + s = s.replace("_", "\\_") # Translate * and ? into SQL LIKE pattern characters % and _ trans = string.maketrans("*?", "%_") match = " LIKE '" + str(s).translate(trans) + "'" -- GitLab From 87f33a1b8f7e3d223fc331fe54fd8ec337dc9cb9 Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Thu, 16 Nov 2023 11:53:18 +0100 Subject: [PATCH 0076/1290] dt-bindings: backlight: mp3309c: Remove two required properties The two properties: - max-brightness - default brightness are not really required, so they can be removed from the "required" section. The "max-brightness" is no longer used in the current version of the driver (it was used only in the first version). The "default-brightness", if omitted in the DT, is managed by the device driver, using a default value. This value depends on the dimming mode used: - for the "analog mode", via I2C commands, this value is fixed by hardware (=31) - while in case of pwm mode the default used is the last value of the brightness-levels array. Also the brightness-levels array is not required: - in "analog mode", via I2C commands, the brightness-level array is fixed by hardware (0..31).; - in pwm dimming mode, the driver uses a default array of 0..255 and the "default-brightness" is the last one, which is "255" NOTE: there are no compatibility problems with the previous version, since the device driver has not yet been included in any kernel. Only this dt-binding yaml file is already included in the current v6.7.0-rc1 kernel version. No developer may have used it. Other changes: - improve the backlight working mode description, in the "description" section - update the example, removing the "max-brightness" and introducing the "brightess-levels" property Signed-off-by: Flavio Suligoi Acked-by: Conor Dooley Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20231116105319.957600-2-f.suligoi@asem.it Signed-off-by: Lee Jones --- .../bindings/leds/backlight/mps,mp3309c.yaml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/leds/backlight/mps,mp3309c.yaml b/Documentation/devicetree/bindings/leds/backlight/mps,mp3309c.yaml index 4191e33626f51..527a37368ed74 100644 --- a/Documentation/devicetree/bindings/leds/backlight/mps,mp3309c.yaml +++ b/Documentation/devicetree/bindings/leds/backlight/mps,mp3309c.yaml @@ -14,8 +14,8 @@ description: | programmable switching frequency to optimize efficiency. It supports two different dimming modes: - - analog mode, via I2C commands (default) - - PWM controlled mode. + - analog mode, via I2C commands, as default mode (32 dimming levels) + - PWM controlled mode (optional) The datasheet is available at: https://www.monolithicpower.com/en/mp3309c.html @@ -50,8 +50,6 @@ properties: required: - compatible - reg - - max-brightness - - default-brightness unevaluatedProperties: false @@ -66,8 +64,8 @@ examples: compatible = "mps,mp3309c"; reg = <0x17>; pwms = <&pwm1 0 3333333 0>; /* 300 Hz --> (1/f) * 1*10^9 */ - max-brightness = <100>; - default-brightness = <80>; + brightness-levels = <0 4 8 16 32 64 128 255>; + default-brightness = <6>; mps,overvoltage-protection-microvolt = <24000000>; }; }; -- GitLab From 2e914516a58cf86bd0e42c7d3e25c81d44ec2ab8 Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Thu, 16 Nov 2023 11:53:19 +0100 Subject: [PATCH 0077/1290] backlight: mp3309c: Add support for MPS MP3309C The Monolithic Power (MPS) MP3309C is a WLED step-up converter, featuring a programmable switching frequency to optimize efficiency. The brightness can be controlled either by I2C commands (called "analog" mode) or by a PWM input signal (PWM mode). This driver supports both modes. For DT configuration details, please refer to: - Documentation/devicetree/bindings/leds/backlight/mps,mp3309c.yaml The datasheet is available at: - https://www.monolithicpower.com/en/mp3309c.html Signed-off-by: Flavio Suligoi Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20231116105319.957600-3-f.suligoi@asem.it Signed-off-by: Lee Jones --- MAINTAINERS | 7 + drivers/video/backlight/Kconfig | 11 + drivers/video/backlight/Makefile | 1 + drivers/video/backlight/mp3309c.c | 443 ++++++++++++++++++++++++++++++ 4 files changed, 462 insertions(+) create mode 100644 drivers/video/backlight/mp3309c.c diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cfd..c00952c462026 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14658,6 +14658,13 @@ S: Maintained F: Documentation/driver-api/tty/moxa-smartio.rst F: drivers/tty/mxser.* +MP3309C BACKLIGHT DRIVER +M: Flavio Suligoi +L: dri-devel@lists.freedesktop.org +S: Maintained +F: Documentation/devicetree/bindings/leds/backlight/mps,mp3309c.yaml +F: drivers/video/backlight/mp3309c.c + MR800 AVERMEDIA USB FM RADIO DRIVER M: Alexey Klimov L: linux-media@vger.kernel.org diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index 51387b1ef012a..1144a54a35c01 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -402,6 +402,17 @@ config BACKLIGHT_LP8788 help This supports TI LP8788 backlight driver. +config BACKLIGHT_MP3309C + tristate "Backlight Driver for MPS MP3309C" + depends on I2C && PWM + select REGMAP_I2C + help + This supports MPS MP3309C backlight WLED driver in both PWM and + analog/I2C dimming modes. + + To compile this driver as a module, choose M here: the module will + be called mp3309c. + config BACKLIGHT_PANDORA tristate "Backlight driver for Pandora console" depends on TWL4030_CORE diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile index f72e1c3c59e95..1af583de665bb 100644 --- a/drivers/video/backlight/Makefile +++ b/drivers/video/backlight/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_BACKLIGHT_LP855X) += lp855x_bl.o obj-$(CONFIG_BACKLIGHT_LP8788) += lp8788_bl.o obj-$(CONFIG_BACKLIGHT_LV5207LP) += lv5207lp.o obj-$(CONFIG_BACKLIGHT_MAX8925) += max8925_bl.o +obj-$(CONFIG_BACKLIGHT_MP3309C) += mp3309c.o obj-$(CONFIG_BACKLIGHT_MT6370) += mt6370-backlight.o obj-$(CONFIG_BACKLIGHT_OMAP1) += omap1_bl.o obj-$(CONFIG_BACKLIGHT_PANDORA) += pandora_bl.o diff --git a/drivers/video/backlight/mp3309c.c b/drivers/video/backlight/mp3309c.c new file mode 100644 index 0000000000000..3fe4469ef43fd --- /dev/null +++ b/drivers/video/backlight/mp3309c.c @@ -0,0 +1,443 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for MPS MP3309C White LED driver with I2C interface + * + * This driver support both analog (by I2C commands) and PWM dimming control + * modes. + * + * Copyright (C) 2023 ASEM Srl + * Author: Flavio Suligoi + * + * Based on pwm_bl.c + */ + +#include +#include +#include +#include +#include +#include + +#define REG_I2C_0 0x00 +#define REG_I2C_1 0x01 + +#define REG_I2C_0_EN 0x80 +#define REG_I2C_0_D0 0x40 +#define REG_I2C_0_D1 0x20 +#define REG_I2C_0_D2 0x10 +#define REG_I2C_0_D3 0x08 +#define REG_I2C_0_D4 0x04 +#define REG_I2C_0_RSRV1 0x02 +#define REG_I2C_0_RSRV2 0x01 + +#define REG_I2C_1_RSRV1 0x80 +#define REG_I2C_1_DIMS 0x40 +#define REG_I2C_1_SYNC 0x20 +#define REG_I2C_1_OVP0 0x10 +#define REG_I2C_1_OVP1 0x08 +#define REG_I2C_1_VOS 0x04 +#define REG_I2C_1_LEDO 0x02 +#define REG_I2C_1_OTP 0x01 + +#define ANALOG_I2C_NUM_LEVELS 32 /* 0..31 */ +#define ANALOG_I2C_REG_MASK 0x7c + +#define MP3309C_PWM_DEFAULT_NUM_LEVELS 256 /* 0..255 */ + +enum mp3309c_status_value { + FIRST_POWER_ON, + BACKLIGHT_OFF, + BACKLIGHT_ON, +}; + +enum mp3309c_dimming_mode_value { + DIMMING_PWM, + DIMMING_ANALOG_I2C, +}; + +struct mp3309c_platform_data { + unsigned int max_brightness; + unsigned int default_brightness; + unsigned int *levels; + u8 dimming_mode; + u8 over_voltage_protection; + bool sync_mode; + u8 status; +}; + +struct mp3309c_chip { + struct device *dev; + struct mp3309c_platform_data *pdata; + struct backlight_device *bl; + struct gpio_desc *enable_gpio; + struct regmap *regmap; + struct pwm_device *pwmd; +}; + +static const struct regmap_config mp3309c_regmap = { + .name = "mp3309c_regmap", + .reg_bits = 8, + .reg_stride = 1, + .val_bits = 8, + .max_register = REG_I2C_1, +}; + +static int mp3309c_enable_device(struct mp3309c_chip *chip) +{ + u8 reg_val; + int ret; + + /* I2C register #0 - Device enable */ + ret = regmap_update_bits(chip->regmap, REG_I2C_0, REG_I2C_0_EN, + REG_I2C_0_EN); + if (ret) + return ret; + + /* + * I2C register #1 - Set working mode: + * - set one of the two dimming mode: + * - PWM dimming using an external PWM dimming signal + * - analog dimming using I2C commands + * - enable/disable synchronous mode + * - set overvoltage protection (OVP) + */ + reg_val = 0x00; + if (chip->pdata->dimming_mode == DIMMING_PWM) + reg_val |= REG_I2C_1_DIMS; + if (chip->pdata->sync_mode) + reg_val |= REG_I2C_1_SYNC; + reg_val |= chip->pdata->over_voltage_protection; + ret = regmap_write(chip->regmap, REG_I2C_1, reg_val); + if (ret) + return ret; + + return 0; +} + +static int mp3309c_bl_update_status(struct backlight_device *bl) +{ + struct mp3309c_chip *chip = bl_get_data(bl); + int brightness = backlight_get_brightness(bl); + struct pwm_state pwmstate; + unsigned int analog_val, bits_val; + int i, ret; + + if (chip->pdata->dimming_mode == DIMMING_PWM) { + /* + * PWM control mode + */ + pwm_get_state(chip->pwmd, &pwmstate); + pwm_set_relative_duty_cycle(&pwmstate, + chip->pdata->levels[brightness], + chip->pdata->levels[chip->pdata->max_brightness]); + pwmstate.enabled = true; + ret = pwm_apply_state(chip->pwmd, &pwmstate); + if (ret) + return ret; + + switch (chip->pdata->status) { + case FIRST_POWER_ON: + case BACKLIGHT_OFF: + /* + * After 20ms of low pwm signal level, the chip turns + * off automatically. In this case, before enabling the + * chip again, we must wait about 10ms for pwm signal to + * stabilize. + */ + if (brightness > 0) { + msleep(10); + mp3309c_enable_device(chip); + chip->pdata->status = BACKLIGHT_ON; + } else { + chip->pdata->status = BACKLIGHT_OFF; + } + break; + case BACKLIGHT_ON: + if (brightness == 0) + chip->pdata->status = BACKLIGHT_OFF; + break; + } + } else { + /* + * Analog (by I2C command) control mode + * + * The first time, before setting brightness, we must enable the + * device + */ + if (chip->pdata->status == FIRST_POWER_ON) + mp3309c_enable_device(chip); + + /* + * Dimming mode I2C command (fixed dimming range 0..31) + * + * The 5 bits of the dimming analog value D4..D0 is allocated + * in the I2C register #0, in the following way: + * + * +--+--+--+--+--+--+--+--+ + * |EN|D0|D1|D2|D3|D4|XX|XX| + * +--+--+--+--+--+--+--+--+ + */ + analog_val = brightness; + bits_val = 0; + for (i = 0; i <= 5; i++) + bits_val += ((analog_val >> i) & 0x01) << (6 - i); + ret = regmap_update_bits(chip->regmap, REG_I2C_0, + ANALOG_I2C_REG_MASK, bits_val); + if (ret) + return ret; + + if (brightness > 0) + chip->pdata->status = BACKLIGHT_ON; + else + chip->pdata->status = BACKLIGHT_OFF; + } + + return 0; +} + +static const struct backlight_ops mp3309c_bl_ops = { + .update_status = mp3309c_bl_update_status, +}; + +static int pm3309c_parse_dt_node(struct mp3309c_chip *chip, + struct mp3309c_platform_data *pdata) +{ + struct device_node *node = chip->dev->of_node; + struct property *prop_pwms, *prop_levels; + int length = 0; + int ret, i; + unsigned int num_levels, tmp_value; + + if (!node) { + dev_err(chip->dev, "failed to get DT node\n"); + return -ENODEV; + } + + /* + * Dimming mode: the MP3309C provides two dimming control mode: + * + * - PWM mode + * - Analog by I2C control mode (default) + * + * I2C control mode is assumed as default but, if the pwms property is + * found in the backlight node, the mode switches to PWM mode. + */ + pdata->dimming_mode = DIMMING_ANALOG_I2C; + prop_pwms = of_find_property(node, "pwms", &length); + if (prop_pwms) { + chip->pwmd = devm_pwm_get(chip->dev, NULL); + if (IS_ERR(chip->pwmd)) + return dev_err_probe(chip->dev, PTR_ERR(chip->pwmd), + "error getting pwm data\n"); + pdata->dimming_mode = DIMMING_PWM; + pwm_apply_args(chip->pwmd); + } + + /* + * In I2C control mode the dimming levels (0..31) are fixed by the + * hardware, while in PWM control mode they can be chosen by the user, + * to allow nonlinear mappings. + */ + if (pdata->dimming_mode == DIMMING_ANALOG_I2C) { + /* + * Analog (by I2C commands) control mode: fixed 0..31 brightness + * levels + */ + num_levels = ANALOG_I2C_NUM_LEVELS; + + /* Enable GPIO used in I2C dimming mode only */ + chip->enable_gpio = devm_gpiod_get(chip->dev, "enable", + GPIOD_OUT_HIGH); + if (IS_ERR(chip->enable_gpio)) + return dev_err_probe(chip->dev, + PTR_ERR(chip->enable_gpio), + "error getting enable gpio\n"); + } else { + /* + * PWM control mode: check for brightness level in DT + */ + prop_levels = of_find_property(node, "brightness-levels", + &length); + if (prop_levels) { + /* Read brightness levels from DT */ + num_levels = length / sizeof(u32); + if (num_levels < 2) + return -EINVAL; + } else { + /* Use default brightness levels */ + num_levels = MP3309C_PWM_DEFAULT_NUM_LEVELS; + } + } + + /* Fill brightness levels array */ + pdata->levels = devm_kcalloc(chip->dev, num_levels, + sizeof(*pdata->levels), GFP_KERNEL); + if (!pdata->levels) + return -ENOMEM; + if (prop_levels) { + ret = of_property_read_u32_array(node, "brightness-levels", + pdata->levels, + num_levels); + if (ret < 0) + return ret; + } else { + for (i = 0; i < num_levels; i++) + pdata->levels[i] = i; + } + + pdata->max_brightness = num_levels - 1; + + ret = of_property_read_u32(node, "default-brightness", + &pdata->default_brightness); + if (ret) + pdata->default_brightness = pdata->max_brightness; + if (pdata->default_brightness > pdata->max_brightness) { + dev_err(chip->dev, + "default brightness exceeds max brightness\n"); + pdata->default_brightness = pdata->max_brightness; + } + + /* + * Over-voltage protection (OVP) + * + * This (optional) property values are: + * + * - 13.5V + * - 24V + * - 35.5V (hardware default setting) + * + * If missing, the default value for OVP is 35.5V + */ + pdata->over_voltage_protection = REG_I2C_1_OVP1; + if (!of_property_read_u32(node, "mps,overvoltage-protection-microvolt", + &tmp_value)) { + switch (tmp_value) { + case 13500000: + pdata->over_voltage_protection = 0x00; + break; + case 24000000: + pdata->over_voltage_protection = REG_I2C_1_OVP0; + break; + case 35500000: + pdata->over_voltage_protection = REG_I2C_1_OVP1; + break; + default: + return -EINVAL; + } + } + + /* Synchronous (default) and non-synchronous mode */ + pdata->sync_mode = true; + if (of_property_read_bool(node, "mps,no-sync-mode")) + pdata->sync_mode = false; + + return 0; +} + +static int mp3309c_probe(struct i2c_client *client) +{ + struct mp3309c_platform_data *pdata = dev_get_platdata(&client->dev); + struct mp3309c_chip *chip; + struct backlight_properties props; + struct pwm_state pwmstate; + int ret; + + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { + dev_err(&client->dev, "failed to check i2c functionality\n"); + return -EOPNOTSUPP; + } + + chip = devm_kzalloc(&client->dev, sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + chip->dev = &client->dev; + + chip->regmap = devm_regmap_init_i2c(client, &mp3309c_regmap); + if (IS_ERR(chip->regmap)) + return dev_err_probe(&client->dev, PTR_ERR(chip->regmap), + "failed to allocate register map\n"); + + i2c_set_clientdata(client, chip); + + if (!pdata) { + pdata = devm_kzalloc(chip->dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) + return -ENOMEM; + + ret = pm3309c_parse_dt_node(chip, pdata); + if (ret) + return ret; + } + chip->pdata = pdata; + + /* Backlight properties */ + props.brightness = pdata->default_brightness; + props.max_brightness = pdata->max_brightness; + props.scale = BACKLIGHT_SCALE_LINEAR; + props.type = BACKLIGHT_RAW; + props.power = FB_BLANK_UNBLANK; + props.fb_blank = FB_BLANK_UNBLANK; + chip->bl = devm_backlight_device_register(chip->dev, "mp3309c", + chip->dev, chip, + &mp3309c_bl_ops, &props); + if (IS_ERR(chip->bl)) + return dev_err_probe(chip->dev, PTR_ERR(chip->bl), + "error registering backlight device\n"); + + /* In PWM dimming mode, enable pwm device */ + if (chip->pdata->dimming_mode == DIMMING_PWM) { + pwm_init_state(chip->pwmd, &pwmstate); + pwm_set_relative_duty_cycle(&pwmstate, + chip->pdata->default_brightness, + chip->pdata->max_brightness); + pwmstate.enabled = true; + ret = pwm_apply_state(chip->pwmd, &pwmstate); + if (ret) + return dev_err_probe(chip->dev, ret, + "error setting pwm device\n"); + } + + chip->pdata->status = FIRST_POWER_ON; + backlight_update_status(chip->bl); + + return 0; +} + +static void mp3309c_remove(struct i2c_client *client) +{ + struct mp3309c_chip *chip = i2c_get_clientdata(client); + struct backlight_device *bl = chip->bl; + + bl->props.power = FB_BLANK_POWERDOWN; + bl->props.brightness = 0; + backlight_update_status(chip->bl); +} + +static const struct of_device_id mp3309c_match_table[] = { + { .compatible = "mps,mp3309c", }, + { }, +}; +MODULE_DEVICE_TABLE(of, mp3309c_match_table); + +static const struct i2c_device_id mp3309c_id[] = { + { "mp3309c", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, mp3309c_id); + +static struct i2c_driver mp3309c_i2c_driver = { + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = mp3309c_match_table, + }, + .probe = mp3309c_probe, + .remove = mp3309c_remove, + .id_table = mp3309c_id, +}; + +module_i2c_driver(mp3309c_i2c_driver); + +MODULE_DESCRIPTION("Backlight Driver for MPS MP3309C"); +MODULE_AUTHOR("Flavio Suligoi "); +MODULE_LICENSE("GPL"); -- GitLab From 58793f263abc8e5233fabf7466219202db09d048 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Fri, 17 Nov 2023 13:06:25 +0100 Subject: [PATCH 0078/1290] backlight: pwm_bl: Use dev_err_probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use dev_err_probe to simplify error paths. Also let dev_err_probe handle the -EPROBE_DEFER case and add an entry to /sys/kernel/debug/devices_deferred when deferred. Signed-off-by: Alexander Stein Reviewed-by: Uwe Kleine-König Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20231117120625.2398417-1-alexander.stein@ew.tq-group.com Signed-off-by: Lee Jones --- drivers/video/backlight/pwm_bl.c | 34 +++++++++++++++++--------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 289bd9ce4d36d..5bc9c6c075710 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -461,10 +461,9 @@ static int pwm_backlight_probe(struct platform_device *pdev) if (!data) { ret = pwm_backlight_parse_dt(&pdev->dev, &defdata); - if (ret < 0) { - dev_err(&pdev->dev, "failed to find platform data\n"); - return ret; - } + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, + "failed to find platform data\n"); data = &defdata; } @@ -493,24 +492,27 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->enable_gpio = devm_gpiod_get_optional(&pdev->dev, "enable", GPIOD_ASIS); if (IS_ERR(pb->enable_gpio)) { - ret = PTR_ERR(pb->enable_gpio); + ret = dev_err_probe(&pdev->dev, PTR_ERR(pb->enable_gpio), + "failed to acquire enable GPIO\n"); goto err_alloc; } pb->power_supply = devm_regulator_get_optional(&pdev->dev, "power"); if (IS_ERR(pb->power_supply)) { ret = PTR_ERR(pb->power_supply); - if (ret == -ENODEV) + if (ret == -ENODEV) { pb->power_supply = NULL; - else + } else { + dev_err_probe(&pdev->dev, ret, + "failed to acquire power regulator\n"); goto err_alloc; + } } pb->pwm = devm_pwm_get(&pdev->dev, NULL); if (IS_ERR(pb->pwm)) { - ret = PTR_ERR(pb->pwm); - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, "unable to request PWM\n"); + ret = dev_err_probe(&pdev->dev, PTR_ERR(pb->pwm), + "unable to request PWM\n"); goto err_alloc; } @@ -530,8 +532,8 @@ static int pwm_backlight_probe(struct platform_device *pdev) ret = pwm_apply_state(pb->pwm, &state); if (ret) { - dev_err(&pdev->dev, "failed to apply initial PWM state: %d\n", - ret); + dev_err_probe(&pdev->dev, ret, + "failed to apply initial PWM state"); goto err_alloc; } @@ -568,8 +570,8 @@ static int pwm_backlight_probe(struct platform_device *pdev) ret = pwm_backlight_brightness_default(&pdev->dev, data, state.period); if (ret < 0) { - dev_err(&pdev->dev, - "failed to setup default brightness table\n"); + dev_err_probe(&pdev->dev, ret, + "failed to setup default brightness table\n"); goto err_alloc; } @@ -597,8 +599,8 @@ static int pwm_backlight_probe(struct platform_device *pdev) bl = backlight_device_register(dev_name(&pdev->dev), &pdev->dev, pb, &pwm_backlight_ops, &props); if (IS_ERR(bl)) { - dev_err(&pdev->dev, "failed to register backlight\n"); - ret = PTR_ERR(bl); + ret = dev_err_probe(&pdev->dev, PTR_ERR(bl), + "failed to register backlight\n"); goto err_alloc; } -- GitLab From 6543ac13c623f906200dfd3f1c407d8d333b6995 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 17 Oct 2023 11:09:32 -0500 Subject: [PATCH 0079/1290] soundwire: bus: introduce controller_id The existing SoundWire support misses a clear Controller/Manager hiearchical definition to deal with all variants across SOC vendors. a) Intel platforms have one controller with 4 or more Managers. b) AMD platforms have two controllers with one Manager each, but due to BIOS issues use two different link_id values within the scope of a single controller. c) QCOM platforms have one or more controller with one Manager each. This patch adds a 'controller_id' which can be set by higher levels. If assigned to -1, the controller_id will be set to the system-unique IDA-assigned bus->id. The main change is that the bus->id is no longer used for any device name, which makes the definition completely predictable and not dependent on any enumeration order. The bus->id is only used to insert the Managers in the stream rt context. Reviewed-by: Bard Liao Reviewed-by: Vijendar Mukunda Signed-off-by: Pierre-Louis Bossart Reviewed-by: Krzysztof Kozlowski Tested-by: Krzysztof Kozlowski Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/stable/20231017160933.12624-2-pierre-louis.bossart%40linux.intel.com Tested-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20231017160933.12624-2-pierre-louis.bossart@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/amd_manager.c | 8 ++++++++ drivers/soundwire/bus.c | 4 ++++ drivers/soundwire/debugfs.c | 2 +- drivers/soundwire/intel_auxdevice.c | 3 +++ drivers/soundwire/master.c | 2 +- drivers/soundwire/qcom.c | 3 +++ include/linux/soundwire/sdw.h | 4 +++- 7 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c index 3a99f6dcdfafa..a3b1f4e6f0f90 100644 --- a/drivers/soundwire/amd_manager.c +++ b/drivers/soundwire/amd_manager.c @@ -927,6 +927,14 @@ static int amd_sdw_manager_probe(struct platform_device *pdev) amd_manager->bus.clk_stop_timeout = 200; amd_manager->bus.link_id = amd_manager->instance; + /* + * Due to BIOS compatibility, the two links are exposed within + * the scope of a single controller. If this changes, the + * controller_id will have to be updated with drv_data + * information. + */ + amd_manager->bus.controller_id = 0; + switch (amd_manager->instance) { case ACP_SDW0: amd_manager->num_dout_ports = AMD_SDW0_MAX_TX_PORTS; diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index 41b0d9adf68ef..f3fec15c31122 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -22,6 +22,10 @@ static int sdw_get_id(struct sdw_bus *bus) return rc; bus->id = rc; + + if (bus->controller_id == -1) + bus->controller_id = rc; + return 0; } diff --git a/drivers/soundwire/debugfs.c b/drivers/soundwire/debugfs.c index d1553cb771874..67abd7e52f092 100644 --- a/drivers/soundwire/debugfs.c +++ b/drivers/soundwire/debugfs.c @@ -20,7 +20,7 @@ void sdw_bus_debugfs_init(struct sdw_bus *bus) return; /* create the debugfs master-N */ - snprintf(name, sizeof(name), "master-%d-%d", bus->id, bus->link_id); + snprintf(name, sizeof(name), "master-%d-%d", bus->controller_id, bus->link_id); bus->debugfs = debugfs_create_dir(name, sdw_debugfs_root); } diff --git a/drivers/soundwire/intel_auxdevice.c b/drivers/soundwire/intel_auxdevice.c index 7f15e3549e539..93698532deac4 100644 --- a/drivers/soundwire/intel_auxdevice.c +++ b/drivers/soundwire/intel_auxdevice.c @@ -234,6 +234,9 @@ static int intel_link_probe(struct auxiliary_device *auxdev, cdns->instance = sdw->instance; cdns->msg_count = 0; + /* single controller for all SoundWire links */ + bus->controller_id = 0; + bus->link_id = auxdev->id; bus->clk_stop_timeout = 1; diff --git a/drivers/soundwire/master.c b/drivers/soundwire/master.c index 9b05c9e25ebe4..51abedbbaa663 100644 --- a/drivers/soundwire/master.c +++ b/drivers/soundwire/master.c @@ -145,7 +145,7 @@ int sdw_master_device_add(struct sdw_bus *bus, struct device *parent, md->dev.fwnode = fwnode; md->dev.dma_mask = parent->dma_mask; - dev_set_name(&md->dev, "sdw-master-%d", bus->id); + dev_set_name(&md->dev, "sdw-master-%d-%d", bus->controller_id, bus->link_id); ret = device_register(&md->dev); if (ret) { diff --git a/drivers/soundwire/qcom.c b/drivers/soundwire/qcom.c index e9a52c1bd3594..7f8549aab42f3 100644 --- a/drivers/soundwire/qcom.c +++ b/drivers/soundwire/qcom.c @@ -1620,6 +1620,9 @@ static int qcom_swrm_probe(struct platform_device *pdev) } } + /* FIXME: is there a DT-defined value to use ? */ + ctrl->bus.controller_id = -1; + ret = sdw_bus_master_add(&ctrl->bus, dev, dev->fwnode); if (ret) { dev_err(dev, "Failed to register Soundwire controller (%d)\n", diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 904004d8b5622..66f814b63a435 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -886,7 +886,8 @@ struct sdw_master_ops { * struct sdw_bus - SoundWire bus * @dev: Shortcut to &bus->md->dev to avoid changing the entire code. * @md: Master device - * @link_id: Link id number, can be 0 to N, unique for each Master + * @controller_id: system-unique controller ID. If set to -1, the bus @id will be used. + * @link_id: Link id number, can be 0 to N, unique for each Controller * @id: bus system-wide unique id * @slaves: list of Slaves on this bus * @assigned: Bitmap for Slave device numbers. @@ -918,6 +919,7 @@ struct sdw_master_ops { struct sdw_bus { struct device *dev; struct sdw_master_device *md; + int controller_id; unsigned int link_id; int id; struct list_head slaves; -- GitLab From 8a8a9ac8a4972ee69d3dd3d1ae43963ae39cee18 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 17 Oct 2023 11:09:33 -0500 Subject: [PATCH 0080/1290] soundwire: fix initializing sysfs for same devices on different buses If same devices with same device IDs are present on different soundwire buses, the probe fails due to conflicting device names and sysfs entries: sysfs: cannot create duplicate filename '/bus/soundwire/devices/sdw:0:0217:0204:00:0' The link ID is 0 for both devices, so they should be differentiated by the controller ID. Add the controller ID so, the device names and sysfs entries look like: sdw:1:0:0217:0204:00:0 -> ../../../devices/platform/soc@0/6ab0000.soundwire-controller/sdw-master-1-0/sdw:1:0:0217:0204:00:0 sdw:3:0:0217:0204:00:0 -> ../../../devices/platform/soc@0/6b10000.soundwire-controller/sdw-master-3-0/sdw:3:0:0217:0204:00:0 [PLB changes: use bus->controller_id instead of bus->id] Fixes: 7c3cd189b86d ("soundwire: Add Master registration") Cc: stable@vger.kernel.org Reviewed-by: Bard Liao Reviewed-by: Vijendar Mukunda Co-developed-by: Pierre-Louis Bossart Signed-off-by: Pierre-Louis Bossart Signed-off-by: Krzysztof Kozlowski Reviewed-by: Krzysztof Kozlowski Tested-by: Krzysztof Kozlowski Acked-by: Mark Brown Tested-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20231017160933.12624-3-pierre-louis.bossart@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/slave.c | 12 ++++++------ sound/soc/intel/boards/sof_sdw.c | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/soundwire/slave.c b/drivers/soundwire/slave.c index c1c1a2ac293af..060c2982e26b0 100644 --- a/drivers/soundwire/slave.c +++ b/drivers/soundwire/slave.c @@ -39,14 +39,14 @@ int sdw_slave_add(struct sdw_bus *bus, slave->dev.fwnode = fwnode; if (id->unique_id == SDW_IGNORED_UNIQUE_ID) { - /* name shall be sdw:link:mfg:part:class */ - dev_set_name(&slave->dev, "sdw:%01x:%04x:%04x:%02x", - bus->link_id, id->mfg_id, id->part_id, + /* name shall be sdw:ctrl:link:mfg:part:class */ + dev_set_name(&slave->dev, "sdw:%01x:%01x:%04x:%04x:%02x", + bus->controller_id, bus->link_id, id->mfg_id, id->part_id, id->class_id); } else { - /* name shall be sdw:link:mfg:part:class:unique */ - dev_set_name(&slave->dev, "sdw:%01x:%04x:%04x:%02x:%01x", - bus->link_id, id->mfg_id, id->part_id, + /* name shall be sdw:ctrl:link:mfg:part:class:unique */ + dev_set_name(&slave->dev, "sdw:%01x:%01x:%04x:%04x:%02x:%01x", + bus->controller_id, bus->link_id, id->mfg_id, id->part_id, id->class_id, id->unique_id); } diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 3312ad8a563b3..690c279bbb886 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -1232,11 +1232,11 @@ static int fill_sdw_codec_dlc(struct device *dev, else if (is_unique_device(adr_link, sdw_version, mfg_id, part_id, class_id, adr_index)) codec->name = devm_kasprintf(dev, GFP_KERNEL, - "sdw:%01x:%04x:%04x:%02x", link_id, + "sdw:0:%01x:%04x:%04x:%02x", link_id, mfg_id, part_id, class_id); else codec->name = devm_kasprintf(dev, GFP_KERNEL, - "sdw:%01x:%04x:%04x:%02x:%01x", link_id, + "sdw:0:%01x:%04x:%04x:%02x:%01x", link_id, mfg_id, part_id, class_id, unique_id); if (!codec->name) -- GitLab From a7ae05ef356162c2a7ff108a7ff154d7d0dcd6aa Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Thu, 23 Nov 2023 10:53:32 +0000 Subject: [PATCH 0081/1290] soundwire: qcom: set controller id to hw master id Qualcomm Soundwire Controllers IP version after 1.3 have a dedicated master id register which will provide a unique id value for each controller instance. Use this value instead of artificially generated value from idr. Versions 1.3 and below only have one instance of soundwire controller which does no have this register, so let them use value from idr. Signed-off-by: Srinivas Kandagatla Reviewed-by: Krzysztof Kozlowski Tested-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231123105332.102167-1-srinivas.kandagatla@linaro.org Signed-off-by: Vinod Koul --- drivers/soundwire/qcom.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/soundwire/qcom.c b/drivers/soundwire/qcom.c index 7f8549aab42f3..d869e2276815c 100644 --- a/drivers/soundwire/qcom.c +++ b/drivers/soundwire/qcom.c @@ -1620,9 +1620,13 @@ static int qcom_swrm_probe(struct platform_device *pdev) } } - /* FIXME: is there a DT-defined value to use ? */ ctrl->bus.controller_id = -1; + if (ctrl->version > SWRM_VERSION_1_3_0) { + ctrl->reg_read(ctrl, SWRM_COMP_MASTER_ID, &val); + ctrl->bus.controller_id = val; + } + ret = sdw_bus_master_add(&ctrl->bus, dev, dev->fwnode); if (ret) { dev_err(dev, "Failed to register Soundwire controller (%d)\n", -- GitLab From aaf7b392347bc4b32ffcab11c414d983a782e651 Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Fri, 24 Nov 2023 10:27:19 +0530 Subject: [PATCH 0082/1290] dt-bindings: dma: ti: k3-*: Add descriptions for register regions In preparation for introducing more register regions, add description for existing register regions so that its easier to map reg-names to that of SoC Documentations/TRMs. Signed-off-by: Vignesh Raghavendra Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231124045722.191817-2-vigneshr@ti.com Signed-off-by: Vinod Koul --- .../devicetree/bindings/dma/ti/k3-bcdma.yaml | 18 +++++++++++++++--- .../devicetree/bindings/dma/ti/k3-pktdma.yaml | 6 +++++- .../devicetree/bindings/dma/ti/k3-udma.yaml | 5 ++++- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml index 4ca300a42a99c..11727af9df734 100644 --- a/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml +++ b/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml @@ -141,7 +141,10 @@ allOf: ti,sci-rm-range-tchan: false reg: - maxItems: 3 + items: + - description: BCDMA Control /Status Registers region + - description: RX Channel Realtime Registers region + - description: Ring Realtime Registers region reg-names: items: @@ -160,7 +163,12 @@ allOf: then: properties: reg: - minItems: 5 + items: + - description: BCDMA Control /Status Registers region + - description: Block Copy Channel Realtime Registers region + - description: RX Channel Realtime Registers region + - description: TX Channel Realtime Registers region + - description: Ring Realtime Registers region reg-names: items: @@ -184,7 +192,11 @@ allOf: ti,sci-rm-range-bchan: false reg: - maxItems: 4 + items: + - description: BCDMA Control /Status Registers region + - description: RX Channel Realtime Registers region + - description: TX Channel Realtime Registers region + - description: Ring Realtime Registers region reg-names: items: diff --git a/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml index a69f62f854d8c..3580b08f65c68 100644 --- a/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml +++ b/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml @@ -45,7 +45,11 @@ properties: The second cell is the ASEL value for the channel reg: - maxItems: 4 + items: + - description: Packet DMA Control /Status Registers region + - description: RX Channel Realtime Registers region + - description: TX Channel Realtime Registers region + - description: Ring Realtime Registers region reg-names: items: diff --git a/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml index 22f6c5e2f7f4b..ded588bd079a1 100644 --- a/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml +++ b/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml @@ -69,7 +69,10 @@ properties: - ti,j721e-navss-mcu-udmap reg: - maxItems: 3 + items: + - description: UDMA-P Control /Status Registers region + - description: RX Channel Realtime Registers region + - description: TX Channel Realtime Registers region reg-names: items: -- GitLab From f04470678132c2d044b92befab39a933ac4d106c Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Fri, 24 Nov 2023 10:27:20 +0530 Subject: [PATCH 0083/1290] dt-bindings: dma: ti: k3-bcdma: Describe cfg register regions Block copy DMA(BCDMA)module on K3 SoCs have ring, BCHAN, TX and RX channel cfg register regions which are usually configured by a Device Management firmware. But certain entities such as bootloader (like U-Boot) may have to access them directly. Describe this region in the binding documentation for completeness of module description. Keep the binding compatible with existing DTS files by requiring first five regions to be present at least. Signed-off-by: Vignesh Raghavendra Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231124045722.191817-3-vigneshr@ti.com Signed-off-by: Vinod Koul --- .../devicetree/bindings/dma/ti/k3-bcdma.yaml | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml index 11727af9df734..27b8e16365600 100644 --- a/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml +++ b/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml @@ -37,11 +37,11 @@ properties: reg: minItems: 3 - maxItems: 5 + maxItems: 9 reg-names: minItems: 3 - maxItems: 5 + maxItems: 9 "#dma-cells": const: 3 @@ -163,20 +163,30 @@ allOf: then: properties: reg: + minItems: 5 items: - description: BCDMA Control /Status Registers region - description: Block Copy Channel Realtime Registers region - description: RX Channel Realtime Registers region - description: TX Channel Realtime Registers region - description: Ring Realtime Registers region + - description: Ring Configuration Registers region + - description: TX Channel Configuration Registers region + - description: RX Channel Configuration Registers region + - description: Block Copy Channel Configuration Registers region reg-names: + minItems: 5 items: - const: gcfg - const: bchanrt - const: rchanrt - const: tchanrt - const: ringrt + - const: ring + - const: tchan + - const: rchan + - const: bchan required: - ti,sci-rm-range-bchan @@ -232,8 +242,13 @@ examples: <0x0 0x4c000000 0x0 0x20000>, <0x0 0x4a820000 0x0 0x20000>, <0x0 0x4aa40000 0x0 0x20000>, - <0x0 0x4bc00000 0x0 0x100000>; - reg-names = "gcfg", "bchanrt", "rchanrt", "tchanrt", "ringrt"; + <0x0 0x4bc00000 0x0 0x100000>, + <0x0 0x48600000 0x0 0x8000>, + <0x0 0x484a4000 0x0 0x2000>, + <0x0 0x484c2000 0x0 0x2000>, + <0x0 0x48420000 0x0 0x2000>; + reg-names = "gcfg", "bchanrt", "rchanrt", "tchanrt", "ringrt", + "ring", "tchan", "rchan", "bchan"; msi-parent = <&inta_main_dmss>; #dma-cells = <3>; -- GitLab From 8d75e0e5eed23e4f8ced5eacae3255e498a1c304 Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Fri, 24 Nov 2023 10:27:21 +0530 Subject: [PATCH 0084/1290] dt-bindings: dma: ti: k3-pktdma: Describe cfg register regions Packet DMA (PKTDMA) module on K3 SoCs have ring cfg, TX and RX channel cfg and RX flow cfg register regions which are usually configured by a Device Management firmware. But certain entities such as bootloader (like U-Boot) may have to access them directly. Describe this region in the binding documentation for completeness of module description. Keep the binding compatible with existing DTS files by requiring first four regions to be present at least. Signed-off-by: Vignesh Raghavendra Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231124045722.191817-4-vigneshr@ti.com Signed-off-by: Vinod Koul --- .../devicetree/bindings/dma/ti/k3-pktdma.yaml | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml index 3580b08f65c68..11e064c029946 100644 --- a/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml +++ b/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml @@ -45,18 +45,28 @@ properties: The second cell is the ASEL value for the channel reg: + minItems: 4 items: - description: Packet DMA Control /Status Registers region - description: RX Channel Realtime Registers region - description: TX Channel Realtime Registers region - description: Ring Realtime Registers region + - description: Ring Configuration Registers region + - description: TX Configuration Registers region + - description: RX Configuration Registers region + - description: RX Flow Configuration Registers region reg-names: + minItems: 4 items: - const: gcfg - const: rchanrt - const: tchanrt - const: ringrt + - const: ring + - const: tchan + - const: rchan + - const: rflow msi-parent: true @@ -140,8 +150,14 @@ examples: reg = <0x0 0x485c0000 0x0 0x100>, <0x0 0x4a800000 0x0 0x20000>, <0x0 0x4aa00000 0x0 0x40000>, - <0x0 0x4b800000 0x0 0x400000>; - reg-names = "gcfg", "rchanrt", "tchanrt", "ringrt"; + <0x0 0x4b800000 0x0 0x400000>, + <0x0 0x485e0000 0x0 0x20000>, + <0x0 0x484a0000 0x0 0x4000>, + <0x0 0x484c0000 0x0 0x2000>, + <0x0 0x48430000 0x0 0x4000>; + reg-names = "gcfg", "rchanrt", "tchanrt", "ringrt", + "ring", "tchan", "rchan", "rflow"; + msi-parent = <&inta_main_dmss>; #dma-cells = <2>; -- GitLab From d7aaccd3beb1ec34b04b13fa236f50efb77c8d6c Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Fri, 24 Nov 2023 10:27:22 +0530 Subject: [PATCH 0085/1290] dt-bindings: dma: ti: k3-udma: Describe cfg register regions Unified DMA (UDMA) module on K3 SoCs have TX and RX channel cfg and RX flow cfg register regions which are usually configured by a Device Management firmware. But certain entities such as bootloader (like U-Boot) may have to access them directly. Describe this region in the binding documentation for completeness of module description. Keep the binding compatible with existing DTS files by requiring first four regions to be present at least. Signed-off-by: Vignesh Raghavendra Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231124045722.191817-5-vigneshr@ti.com Signed-off-by: Vinod Koul --- .../devicetree/bindings/dma/ti/k3-udma.yaml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml index ded588bd079a1..b18cf2bfdb5b1 100644 --- a/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml +++ b/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml @@ -69,16 +69,24 @@ properties: - ti,j721e-navss-mcu-udmap reg: + minItems: 3 items: - description: UDMA-P Control /Status Registers region - description: RX Channel Realtime Registers region - description: TX Channel Realtime Registers region + - description: TX Configuration Registers region + - description: RX Configuration Registers region + - description: RX Flow Configuration Registers region reg-names: + minItems: 3 items: - const: gcfg - const: rchanrt - const: tchanrt + - const: tchan + - const: rchan + - const: rflow msi-parent: true @@ -161,8 +169,11 @@ examples: compatible = "ti,am654-navss-main-udmap"; reg = <0x0 0x31150000 0x0 0x100>, <0x0 0x34000000 0x0 0x100000>, - <0x0 0x35000000 0x0 0x100000>; - reg-names = "gcfg", "rchanrt", "tchanrt"; + <0x0 0x35000000 0x0 0x100000>, + <0x0 0x30b00000 0x0 0x20000>, + <0x0 0x30c00000 0x0 0x8000>, + <0x0 0x30d00000 0x0 0x4000>; + reg-names = "gcfg", "rchanrt", "tchanrt", "tchan", "rchan", "rflow"; #dma-cells = <1>; ti,ringacc = <&ringacc>; -- GitLab From 66fb6eb6fab63ee80fd26cd5bdd9164aead0d207 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Fri, 24 Nov 2023 15:36:06 +0530 Subject: [PATCH 0086/1290] dt-bindings: dma: qcom: gpi: add compatible for X1E80100 The Qualcomm X1E80100 uses GPI DMA for its GENI interface. Add a compatible string for it in the documentation by using the SM6350 as fallback. Signed-off-by: Sibi Sankar Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231124100608.29964-4-quic_sibis@quicinc.com Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/qcom,gpi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/dma/qcom,gpi.yaml b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml index 88d0de3d1b46b..d06db2cc931eb 100644 --- a/Documentation/devicetree/bindings/dma/qcom,gpi.yaml +++ b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml @@ -32,6 +32,7 @@ properties: - qcom,sm8350-gpi-dma - qcom,sm8450-gpi-dma - qcom,sm8550-gpi-dma + - qcom,x1e80100-gpi-dma - const: qcom,sm6350-gpi-dma - items: - enum: -- GitLab From 56d02cfa3fbfca7466ccd68f4db78b0297f5c01f Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 2 Nov 2023 20:39:22 +0000 Subject: [PATCH 0087/1290] dt-bindings: dma: rz-dmac: Document RZ/Five SoC The DMAC block on the RZ/Five SoC is identical to one found on the RZ/G2UL SoC. "renesas,r9a07g043-dmac" compatible string will be used on the RZ/Five SoC so to make this clear, update the comment to include RZ/Five SoC. No driver changes are required as generic compatible string "renesas,rz-dmac" will be used as a fallback on RZ/Five SoC. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20231102203922.548353-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/renesas,rz-dmac.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/dma/renesas,rz-dmac.yaml b/Documentation/devicetree/bindings/dma/renesas,rz-dmac.yaml index c284abc6784ae..a42b6a26a6d3f 100644 --- a/Documentation/devicetree/bindings/dma/renesas,rz-dmac.yaml +++ b/Documentation/devicetree/bindings/dma/renesas,rz-dmac.yaml @@ -16,7 +16,7 @@ properties: compatible: items: - enum: - - renesas,r9a07g043-dmac # RZ/G2UL + - renesas,r9a07g043-dmac # RZ/G2UL and RZ/Five - renesas,r9a07g044-dmac # RZ/G2{L,LC} - renesas,r9a07g054-dmac # RZ/V2L - const: renesas,rz-dmac -- GitLab From 0fdd1c4ea99e188dfa8ab7bafbe4004cc72dca30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 5 Nov 2023 10:34:17 +0100 Subject: [PATCH 0088/1290] dmaengine: milbeaut-hdmac: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). There is an error path that has the above mentioned problem. This patch only adds a more drastic error message. To properly fix it, dmaengine_terminate_sync() must be known to have succeeded (or that it's safe to not call it as other drivers seem to assume). Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231105093415.3704633-7-u.kleine-koenig@pengutronix.de Signed-off-by: Vinod Koul --- drivers/dma/milbeaut-hdmac.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/dma/milbeaut-hdmac.c b/drivers/dma/milbeaut-hdmac.c index 1b0a95892627d..7b41c670970a6 100644 --- a/drivers/dma/milbeaut-hdmac.c +++ b/drivers/dma/milbeaut-hdmac.c @@ -531,7 +531,7 @@ disable_clk: return ret; } -static int milbeaut_hdmac_remove(struct platform_device *pdev) +static void milbeaut_hdmac_remove(struct platform_device *pdev) { struct milbeaut_hdmac_device *mdev = platform_get_drvdata(pdev); struct dma_chan *chan; @@ -546,16 +546,21 @@ static int milbeaut_hdmac_remove(struct platform_device *pdev) */ list_for_each_entry(chan, &mdev->ddev.channels, device_node) { ret = dmaengine_terminate_sync(chan); - if (ret) - return ret; + if (ret) { + /* + * This results in resource leakage and maybe also + * use-after-free errors as e.g. *mdev is kfreed. + */ + dev_alert(&pdev->dev, "Failed to terminate channel %d (%pe)\n", + chan->chan_id, ERR_PTR(ret)); + return; + } milbeaut_hdmac_free_chan_resources(chan); } of_dma_controller_free(pdev->dev.of_node); dma_async_device_unregister(&mdev->ddev); clk_disable_unprepare(mdev->clk); - - return 0; } static const struct of_device_id milbeaut_hdmac_match[] = { @@ -566,7 +571,7 @@ MODULE_DEVICE_TABLE(of, milbeaut_hdmac_match); static struct platform_driver milbeaut_hdmac_driver = { .probe = milbeaut_hdmac_probe, - .remove = milbeaut_hdmac_remove, + .remove_new = milbeaut_hdmac_remove, .driver = { .name = "milbeaut-m10v-hdmac", .of_match_table = milbeaut_hdmac_match, -- GitLab From 47ee210011ddb8b366c7dcf9c1a9c3818573df29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 5 Nov 2023 10:34:18 +0100 Subject: [PATCH 0089/1290] dmaengine: milbeaut-xdmac: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). There is an error path that has the above mentioned problem. This patch only adds a more drastic error message. To properly fix it, dmaengine_terminate_sync() must be known to have succeeded (or that it's safe to not call it as other drivers seem to assume). Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231105093415.3704633-8-u.kleine-koenig@pengutronix.de Signed-off-by: Vinod Koul --- drivers/dma/milbeaut-xdmac.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/dma/milbeaut-xdmac.c b/drivers/dma/milbeaut-xdmac.c index d29d01e730aa0..2cce529b448eb 100644 --- a/drivers/dma/milbeaut-xdmac.c +++ b/drivers/dma/milbeaut-xdmac.c @@ -368,7 +368,7 @@ disable_xdmac: return ret; } -static int milbeaut_xdmac_remove(struct platform_device *pdev) +static void milbeaut_xdmac_remove(struct platform_device *pdev) { struct milbeaut_xdmac_device *mdev = platform_get_drvdata(pdev); struct dma_chan *chan; @@ -383,8 +383,15 @@ static int milbeaut_xdmac_remove(struct platform_device *pdev) */ list_for_each_entry(chan, &mdev->ddev.channels, device_node) { ret = dmaengine_terminate_sync(chan); - if (ret) - return ret; + if (ret) { + /* + * This results in resource leakage and maybe also + * use-after-free errors as e.g. *mdev is kfreed. + */ + dev_alert(&pdev->dev, "Failed to terminate channel %d (%pe)\n", + chan->chan_id, ERR_PTR(ret)); + return; + } milbeaut_xdmac_free_chan_resources(chan); } @@ -392,8 +399,6 @@ static int milbeaut_xdmac_remove(struct platform_device *pdev) dma_async_device_unregister(&mdev->ddev); disable_xdmac(mdev); - - return 0; } static const struct of_device_id milbeaut_xdmac_match[] = { @@ -404,7 +409,7 @@ MODULE_DEVICE_TABLE(of, milbeaut_xdmac_match); static struct platform_driver milbeaut_xdmac_driver = { .probe = milbeaut_xdmac_probe, - .remove = milbeaut_xdmac_remove, + .remove_new = milbeaut_xdmac_remove, .driver = { .name = "milbeaut-m10v-xdmac", .of_match_table = milbeaut_xdmac_match, -- GitLab From 5d4304a8d5646c268d73383fbc179db53f85b921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 5 Nov 2023 10:34:19 +0100 Subject: [PATCH 0090/1290] dmaengine: uniphier-mdmac: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). There is an error path that has the above mentioned problem. This patch only adds a more drastic error message. To properly fix it, dmaengine_terminate_sync() must be known to have succeeded (or that it's safe to not call it as other drivers seem to assume). Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231105093415.3704633-9-u.kleine-koenig@pengutronix.de Signed-off-by: Vinod Koul --- drivers/dma/uniphier-mdmac.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/dma/uniphier-mdmac.c b/drivers/dma/uniphier-mdmac.c index 618839df07486..ad7125f6e2ca8 100644 --- a/drivers/dma/uniphier-mdmac.c +++ b/drivers/dma/uniphier-mdmac.c @@ -453,7 +453,7 @@ disable_clk: return ret; } -static int uniphier_mdmac_remove(struct platform_device *pdev) +static void uniphier_mdmac_remove(struct platform_device *pdev) { struct uniphier_mdmac_device *mdev = platform_get_drvdata(pdev); struct dma_chan *chan; @@ -468,16 +468,21 @@ static int uniphier_mdmac_remove(struct platform_device *pdev) */ list_for_each_entry(chan, &mdev->ddev.channels, device_node) { ret = dmaengine_terminate_sync(chan); - if (ret) - return ret; + if (ret) { + /* + * This results in resource leakage and maybe also + * use-after-free errors as e.g. *mdev is kfreed. + */ + dev_alert(&pdev->dev, "Failed to terminate channel %d (%pe)\n", + chan->chan_id, ERR_PTR(ret)); + return; + } uniphier_mdmac_free_chan_resources(chan); } of_dma_controller_free(pdev->dev.of_node); dma_async_device_unregister(&mdev->ddev); clk_disable_unprepare(mdev->clk); - - return 0; } static const struct of_device_id uniphier_mdmac_match[] = { @@ -488,7 +493,7 @@ MODULE_DEVICE_TABLE(of, uniphier_mdmac_match); static struct platform_driver uniphier_mdmac_driver = { .probe = uniphier_mdmac_probe, - .remove = uniphier_mdmac_remove, + .remove_new = uniphier_mdmac_remove, .driver = { .name = "uniphier-mio-dmac", .of_match_table = uniphier_mdmac_match, -- GitLab From ead0e402e50d1101939e4af67891d5b2fa9678b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 5 Nov 2023 10:34:20 +0100 Subject: [PATCH 0091/1290] dmaengine: uniphier-xdmac: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). There is an error path that has the above mentioned problem. This patch only adds a more drastic error message. To properly fix it, dmaengine_terminate_sync() must be known to have succeeded (or that it's safe to not call it as other drivers seem to assume). Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231105093415.3704633-10-u.kleine-koenig@pengutronix.de Signed-off-by: Vinod Koul --- drivers/dma/uniphier-xdmac.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/dma/uniphier-xdmac.c b/drivers/dma/uniphier-xdmac.c index 3a8ee2b173b52..3ce2dc2ad9de4 100644 --- a/drivers/dma/uniphier-xdmac.c +++ b/drivers/dma/uniphier-xdmac.c @@ -563,7 +563,7 @@ out_unregister_dmac: return ret; } -static int uniphier_xdmac_remove(struct platform_device *pdev) +static void uniphier_xdmac_remove(struct platform_device *pdev) { struct uniphier_xdmac_device *xdev = platform_get_drvdata(pdev); struct dma_device *ddev = &xdev->ddev; @@ -579,15 +579,20 @@ static int uniphier_xdmac_remove(struct platform_device *pdev) */ list_for_each_entry(chan, &ddev->channels, device_node) { ret = dmaengine_terminate_sync(chan); - if (ret) - return ret; + if (ret) { + /* + * This results in resource leakage and maybe also + * use-after-free errors as e.g. *xdev is kfreed. + */ + dev_alert(&pdev->dev, "Failed to terminate channel %d (%pe)\n", + chan->chan_id, ERR_PTR(ret)); + return; + } uniphier_xdmac_free_chan_resources(chan); } of_dma_controller_free(pdev->dev.of_node); dma_async_device_unregister(ddev); - - return 0; } static const struct of_device_id uniphier_xdmac_match[] = { @@ -598,7 +603,7 @@ MODULE_DEVICE_TABLE(of, uniphier_xdmac_match); static struct platform_driver uniphier_xdmac_driver = { .probe = uniphier_xdmac_probe, - .remove = uniphier_xdmac_remove, + .remove_new = uniphier_xdmac_remove, .driver = { .name = "uniphier-xdmac", .of_match_table = uniphier_xdmac_match, -- GitLab From 375ff42c4c9825c19a53b9095ae4b3337cc83442 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Wed, 25 Oct 2023 10:23:04 +0200 Subject: [PATCH 0092/1290] dt-bindings: dma: qcom,gpi: document the SM8650 GPI DMA Engine Document the GPI DMA Engine on the SM8650 Platform. Signed-off-by: Neil Armstrong Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231025-topic-sm8650-upstream-bindings-gpi-v2-1-4de85293d730@linaro.org Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/qcom,gpi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/dma/qcom,gpi.yaml b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml index d06db2cc931eb..deb64cb9ca3ea 100644 --- a/Documentation/devicetree/bindings/dma/qcom,gpi.yaml +++ b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml @@ -32,6 +32,7 @@ properties: - qcom,sm8350-gpi-dma - qcom,sm8450-gpi-dma - qcom,sm8550-gpi-dma + - qcom,sm8650-gpi-dma - qcom,x1e80100-gpi-dma - const: qcom,sm6350-gpi-dma - items: -- GitLab From 306f5df81fcc89b462fbeb9dbe26d9a8ad7c7582 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Sun, 29 Oct 2023 18:07:04 +0100 Subject: [PATCH 0093/1290] dmaengine: apple-admac: Keep upper bits of REG_BUS_WIDTH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For RX channels, REG_BUS_WIDTH seems to default to a value of 0xf00, and macOS preserves the upper bits when setting the configuration in the lower ones. If we reset the upper bits to 0, this causes framing errors on suspend/resume (the data stream "tears" and channels get swapped around). Keeping the upper bits untouched, like the macOS driver does, fixes this issue. Signed-off-by: Hector Martin Reviewed-by: Martin Povišer Signed-off-by: Martin Povišer Link: https://lore.kernel.org/r/20231029170704.82238-1-povik+lin@cutebit.org Signed-off-by: Vinod Koul --- drivers/dma/apple-admac.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/dma/apple-admac.c b/drivers/dma/apple-admac.c index 5b63996640d9d..9588773dd2eb6 100644 --- a/drivers/dma/apple-admac.c +++ b/drivers/dma/apple-admac.c @@ -57,6 +57,8 @@ #define REG_BUS_WIDTH(ch) (0x8040 + (ch) * 0x200) +#define BUS_WIDTH_WORD_SIZE GENMASK(3, 0) +#define BUS_WIDTH_FRAME_SIZE GENMASK(7, 4) #define BUS_WIDTH_8BIT 0x00 #define BUS_WIDTH_16BIT 0x01 #define BUS_WIDTH_32BIT 0x02 @@ -740,7 +742,8 @@ static int admac_device_config(struct dma_chan *chan, struct admac_data *ad = adchan->host; bool is_tx = admac_chan_direction(adchan->no) == DMA_MEM_TO_DEV; int wordsize = 0; - u32 bus_width = 0; + u32 bus_width = readl_relaxed(ad->base + REG_BUS_WIDTH(adchan->no)) & + ~(BUS_WIDTH_WORD_SIZE | BUS_WIDTH_FRAME_SIZE); switch (is_tx ? config->dst_addr_width : config->src_addr_width) { case DMA_SLAVE_BUSWIDTH_1_BYTE: -- GitLab From 1cba275017352ba887058934d23b5c76a3de62ae Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sun, 26 Nov 2023 01:02:48 -0800 Subject: [PATCH 0094/1290] apparmor: cleanup network hook comments Drop useless partial kernel doc style comments. Finish/update kerneldoc comment where there is useful information Signed-off-by: John Johansen --- security/apparmor/lsm.c | 60 +++++++++++------------------------------ 1 file changed, 16 insertions(+), 44 deletions(-) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index d3d2fc13c6e7c..3eb992801a7f5 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -987,9 +987,6 @@ static int apparmor_userns_create(const struct cred *cred) return error; } -/** - * apparmor_sk_alloc_security - allocate and attach the sk_security field - */ static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t flags) { struct aa_sk_ctx *ctx; @@ -1003,9 +1000,6 @@ static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t flags) return 0; } -/** - * apparmor_sk_free_security - free the sk_security field - */ static void apparmor_sk_free_security(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); @@ -1018,6 +1012,8 @@ static void apparmor_sk_free_security(struct sock *sk) /** * apparmor_sk_clone_security - clone the sk_security field + * @sk: sock to have security cloned + * @newsk: sock getting clone */ static void apparmor_sk_clone_security(const struct sock *sk, struct sock *newsk) @@ -1034,9 +1030,6 @@ static void apparmor_sk_clone_security(const struct sock *sk, new->peer = aa_get_label(ctx->peer); } -/** - * apparmor_socket_create - check perms before creating a new socket - */ static int apparmor_socket_create(int family, int type, int protocol, int kern) { struct aa_label *label; @@ -1058,10 +1051,14 @@ static int apparmor_socket_create(int family, int type, int protocol, int kern) /** * apparmor_socket_post_create - setup the per-socket security struct + * @sock: socket that is being setup + * @family: family of socket being created + * @type: type of the socket + * @ptotocol: protocol of the socket + * @kern: socket is a special kernel socket * * Note: - * - kernel sockets currently labeled unconfined but we may want to - * move to a special kernel label + * - kernel sockets labeled kernel_t used to use unconfined * - socket may not have sk here if created with sock_create_lite or * sock_alloc. These should be accept cases which will be handled in * sock_graft. @@ -1087,9 +1084,6 @@ static int apparmor_socket_post_create(struct socket *sock, int family, return 0; } -/** - * apparmor_socket_bind - check perms before bind addr to socket - */ static int apparmor_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { @@ -1103,9 +1097,6 @@ static int apparmor_socket_bind(struct socket *sock, aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk)); } -/** - * apparmor_socket_connect - check perms before connecting @sock to @address - */ static int apparmor_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { @@ -1119,9 +1110,6 @@ static int apparmor_socket_connect(struct socket *sock, aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk)); } -/** - * apparmor_socket_listen - check perms before allowing listen - */ static int apparmor_socket_listen(struct socket *sock, int backlog) { AA_BUG(!sock); @@ -1133,9 +1121,7 @@ static int apparmor_socket_listen(struct socket *sock, int backlog) aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk)); } -/** - * apparmor_socket_accept - check perms before accepting a new connection. - * +/* * Note: while @newsock is created and has some information, the accept * has not been done. */ @@ -1164,18 +1150,12 @@ static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock, aa_sk_perm(op, request, sock->sk)); } -/** - * apparmor_socket_sendmsg - check perms before sending msg to another socket - */ static int apparmor_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return aa_sock_msg_perm(OP_SENDMSG, AA_MAY_SEND, sock, msg, size); } -/** - * apparmor_socket_recvmsg - check perms before receiving a message - */ static int apparmor_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { @@ -1194,17 +1174,11 @@ static int aa_sock_perm(const char *op, u32 request, struct socket *sock) aa_sk_perm(op, request, sock->sk)); } -/** - * apparmor_socket_getsockname - check perms before getting the local address - */ static int apparmor_socket_getsockname(struct socket *sock) { return aa_sock_perm(OP_GETSOCKNAME, AA_MAY_GETATTR, sock); } -/** - * apparmor_socket_getpeername - check perms before getting remote address - */ static int apparmor_socket_getpeername(struct socket *sock) { return aa_sock_perm(OP_GETPEERNAME, AA_MAY_GETATTR, sock); @@ -1223,9 +1197,6 @@ static int aa_sock_opt_perm(const char *op, u32 request, struct socket *sock, aa_sk_perm(op, request, sock->sk)); } -/** - * apparmor_socket_getsockopt - check perms before getting socket options - */ static int apparmor_socket_getsockopt(struct socket *sock, int level, int optname) { @@ -1233,9 +1204,6 @@ static int apparmor_socket_getsockopt(struct socket *sock, int level, level, optname); } -/** - * apparmor_socket_setsockopt - check perms before setting socket options - */ static int apparmor_socket_setsockopt(struct socket *sock, int level, int optname) { @@ -1243,9 +1211,6 @@ static int apparmor_socket_setsockopt(struct socket *sock, int level, level, optname); } -/** - * apparmor_socket_shutdown - check perms before shutting down @sock conn - */ static int apparmor_socket_shutdown(struct socket *sock, int how) { return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock); @@ -1254,6 +1219,8 @@ static int apparmor_socket_shutdown(struct socket *sock, int how) #ifdef CONFIG_NETWORK_SECMARK /** * apparmor_socket_sock_rcv_skb - check perms before associating skb to sk + * @sk: sk to associate @skb with + * @skb: skb to check for perms * * Note: can not sleep may be called with locks held * @@ -1285,6 +1252,11 @@ static struct aa_label *sk_peer_label(struct sock *sk) /** * apparmor_socket_getpeersec_stream - get security context of peer + * @sock: socket that we are trying to get the peer context of + * @optval: output - buffer to copy peer name to + * @optlen: output - size of copied name in @optval + * @len: size of @optval buffer + * Returns: 0 on success, -errno of failure * * Note: for tcp only valid if using ipsec or cipso on lan */ -- GitLab From f1aad9df93f39267e890836a28d22511f23474e1 Mon Sep 17 00:00:00 2001 From: Laurentiu Tudor Date: Tue, 26 Sep 2023 18:26:00 +0300 Subject: [PATCH 0095/1290] iommu: Map reserved memory as cacheable if device is coherent Check if the device is marked as DMA coherent in the DT and if so, map its reserved memory as cacheable in the IOMMU. This fixes the recently added IOMMU reserved memory support which uses IOMMU_RESV_DIRECT without properly building the PROT for the mapping. Fixes: a5bf3cfce8cb ("iommu: Implement of_iommu_get_resv_regions()") Signed-off-by: Laurentiu Tudor Reviewed-by: Jason Gunthorpe Acked-by: Thierry Reding Link: https://lore.kernel.org/r/20230926152600.8749-1-laurentiu.tudor@nxp.com Signed-off-by: Joerg Roedel --- drivers/iommu/of_iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 157b286e36bf3..5b3631ba5a451 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -254,6 +254,9 @@ void of_iommu_get_resv_regions(struct device *dev, struct list_head *list) phys_addr_t iova; size_t length; + if (of_dma_is_coherent(dev->of_node)) + prot |= IOMMU_CACHE; + maps = of_translate_dma_region(np, maps, &iova, &length); type = iommu_resv_region_get_type(dev, &phys, iova, length); -- GitLab From 9abe6c55354db38f0906fcaf4e1c1644c26cd624 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Mon, 20 Nov 2023 17:53:42 +0800 Subject: [PATCH 0096/1290] iommu/amd: Set variable amd_dirty_ops to static Fix the followng warning: drivers/iommu/amd/iommu.c:67:30: warning: symbol 'amd_dirty_ops' was not declared. Should it be static? This variable is only used in its defining file, so it should be static. Signed-off-by: Kunwu Chan Reviewed-by: Joao Martins Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231120095342.1102999-1-chentao@kylinos.cn Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index fcc987f5d4edc..9f70643608283 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -64,7 +64,7 @@ LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); const struct iommu_ops amd_iommu_ops; -const struct iommu_dirty_ops amd_dirty_ops; +static const struct iommu_dirty_ops amd_dirty_ops; int amd_iommu_max_glx_val = -1; @@ -2635,7 +2635,7 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) return true; } -const struct iommu_dirty_ops amd_dirty_ops = { +static const struct iommu_dirty_ops amd_dirty_ops = { .set_dirty_tracking = amd_iommu_set_dirty_tracking, .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, }; -- GitLab From 00271ca5cbcd36d01f3c58a05378304ba9dd6a2a Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Mon, 20 Nov 2023 15:51:56 +0100 Subject: [PATCH 0097/1290] iommu/virtio: Make use of ops->iotlb_sync_map Pull out the sync operation from viommu_map_pages() by implementing ops->iotlb_sync_map. This allows the common IOMMU code to map multiple elements of an sg with a single sync (see iommu_map_sg()). Link: https://lore.kernel.org/lkml/20230726111433.1105665-1-schnelle@linux.ibm.com/ Signed-off-by: Niklas Schnelle Reviewed-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20231120-viommu-sync-map-v3-1-50a57ecf78b5@linux.ibm.com Signed-off-by: Joerg Roedel --- drivers/iommu/virtio-iommu.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 379ebe03efb6d..3f18f69d70d89 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -843,7 +843,7 @@ static int viommu_map_pages(struct iommu_domain *domain, unsigned long iova, .flags = cpu_to_le32(flags), }; - ret = viommu_send_req_sync(vdomain->viommu, &map, sizeof(map)); + ret = viommu_add_req(vdomain->viommu, &map, sizeof(map)); if (ret) { viommu_del_mappings(vdomain, iova, end); return ret; @@ -912,6 +912,20 @@ static void viommu_iotlb_sync(struct iommu_domain *domain, viommu_sync_req(vdomain->viommu); } +static int viommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + struct viommu_domain *vdomain = to_viommu_domain(domain); + + /* + * May be called before the viommu is initialized including + * while creating direct mapping + */ + if (!vdomain->nr_endpoints) + return 0; + return viommu_sync_req(vdomain->viommu); +} + static void viommu_get_resv_regions(struct device *dev, struct list_head *head) { struct iommu_resv_region *entry, *new_entry, *msi = NULL; @@ -1058,6 +1072,7 @@ static struct iommu_ops viommu_ops = { .unmap_pages = viommu_unmap_pages, .iova_to_phys = viommu_iova_to_phys, .iotlb_sync = viommu_iotlb_sync, + .iotlb_sync_map = viommu_iotlb_sync_map, .free = viommu_domain_free, } }; -- GitLab From 6f01a732608f294a853f0d1cc963772bac12f1e9 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Mon, 20 Nov 2023 15:51:57 +0100 Subject: [PATCH 0098/1290] iommu/virtio: Add ops->flush_iotlb_all and enable deferred flush Add ops->flush_iotlb_all operation to enable virtio-iommu for the dma-iommu deferred flush scheme. This results in a significant increase in performance in exchange for a window in which devices can still access previously IOMMU mapped memory when running with CONFIG_IOMMU_DEFAULT_DMA_LAZY. The previous strict behavior can be achieved with iommu.strict=1 on the kernel command line or CONFIG_IOMMU_DEFAULT_DMA_STRICT. Link: https://lore.kernel.org/lkml/20230802123612.GA6142@myrica/ Signed-off-by: Niklas Schnelle Reviewed-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20231120-viommu-sync-map-v3-2-50a57ecf78b5@linux.ibm.com Signed-off-by: Joerg Roedel --- drivers/iommu/virtio-iommu.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 3f18f69d70d89..ef6a874451059 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -926,6 +926,19 @@ static int viommu_iotlb_sync_map(struct iommu_domain *domain, return viommu_sync_req(vdomain->viommu); } +static void viommu_flush_iotlb_all(struct iommu_domain *domain) +{ + struct viommu_domain *vdomain = to_viommu_domain(domain); + + /* + * May be called before the viommu is initialized including + * while creating direct mapping + */ + if (!vdomain->nr_endpoints) + return; + viommu_sync_req(vdomain->viommu); +} + static void viommu_get_resv_regions(struct device *dev, struct list_head *head) { struct iommu_resv_region *entry, *new_entry, *msi = NULL; @@ -1051,6 +1064,8 @@ static bool viommu_capable(struct device *dev, enum iommu_cap cap) switch (cap) { case IOMMU_CAP_CACHE_COHERENCY: return true; + case IOMMU_CAP_DEFERRED_FLUSH: + return true; default: return false; } @@ -1071,6 +1086,7 @@ static struct iommu_ops viommu_ops = { .map_pages = viommu_map_pages, .unmap_pages = viommu_unmap_pages, .iova_to_phys = viommu_iova_to_phys, + .flush_iotlb_all = viommu_flush_iotlb_all, .iotlb_sync = viommu_iotlb_sync, .iotlb_sync_map = viommu_iotlb_sync_map, .free = viommu_domain_free, -- GitLab From 48ed12788ed8aa099e463c2febcd3f06fb71c68c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 21 Nov 2023 18:03:57 +0000 Subject: [PATCH 0099/1290] iommu: Factor out some helpers The pattern for picking the first device out of the group list is repeated a few times now, so it's clearly worth factoring out, which also helps hide the iommu_group_dev detail from places that don't need to know. Similarly, the safety check for dev_iommu_ops() at certain public interfaces starts looking a bit repetitive, and might not be completely obvious at first glance, so let's factor that out for clarity as well, in preparation for more uses of both. Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Jerry Snitselaar Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/566cbd161546caa6aed49662c9b3e8f09dc9c3cf.1700589539.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 56 ++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f17a1113f3d6a..5c555fc0d54ca 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -368,6 +368,15 @@ static void dev_iommu_free(struct device *dev) kfree(param); } +/* + * Internal equivalent of device_iommu_mapped() for when we care that a device + * actually has API ops, and don't want false positives from VFIO-only groups. + */ +static bool dev_has_iommu(struct device *dev) +{ + return dev->iommu && dev->iommu->iommu_dev; +} + static u32 dev_iommu_get_max_pasids(struct device *dev) { u32 max_pasids = 0, bits = 0; @@ -620,7 +629,7 @@ static void __iommu_group_remove_device(struct device *dev) list_del(&device->list); __iommu_group_free_device(group, device); - if (dev->iommu && dev->iommu->iommu_dev) + if (dev_has_iommu(dev)) iommu_deinit_device(dev); else dev->iommu_group = NULL; @@ -819,7 +828,7 @@ int iommu_get_group_resv_regions(struct iommu_group *group, * Non-API groups still expose reserved_regions in sysfs, * so filter out calls that get here that way. */ - if (!device->dev->iommu) + if (!dev_has_iommu(device->dev)) break; INIT_LIST_HEAD(&dev_resv_regions); @@ -1225,6 +1234,12 @@ void iommu_group_remove_device(struct device *dev) } EXPORT_SYMBOL_GPL(iommu_group_remove_device); +static struct device *iommu_group_first_dev(struct iommu_group *group) +{ + lockdep_assert_held(&group->mutex); + return list_first_entry(&group->devices, struct group_device, list)->dev; +} + /** * iommu_group_for_each_dev - iterate over each device in the group * @group: the group @@ -1752,23 +1767,6 @@ __iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) return __iommu_group_domain_alloc(group, req_type); } -/* - * Returns the iommu_ops for the devices in an iommu group. - * - * It is assumed that all devices in an iommu group are managed by a single - * IOMMU unit. Therefore, this returns the dev_iommu_ops of the first device - * in the group. - */ -static const struct iommu_ops *group_iommu_ops(struct iommu_group *group) -{ - struct group_device *device = - list_first_entry(&group->devices, struct group_device, list); - - lockdep_assert_held(&group->mutex); - - return dev_iommu_ops(device->dev); -} - /* * req_type of 0 means "auto" which means to select a domain based on * iommu_def_domain_type or what the driver actually supports. @@ -1776,7 +1774,7 @@ static const struct iommu_ops *group_iommu_ops(struct iommu_group *group) static struct iommu_domain * iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { - const struct iommu_ops *ops = group_iommu_ops(group); + const struct iommu_ops *ops = dev_iommu_ops(iommu_group_first_dev(group)); struct iommu_domain *dom; lockdep_assert_held(&group->mutex); @@ -1854,7 +1852,7 @@ static int iommu_bus_notifier(struct notifier_block *nb, static int iommu_get_def_domain_type(struct iommu_group *group, struct device *dev, int cur_type) { - const struct iommu_ops *ops = group_iommu_ops(group); + const struct iommu_ops *ops = dev_iommu_ops(dev); int type; if (!ops->def_domain_type) @@ -2021,7 +2019,7 @@ bool device_iommu_capable(struct device *dev, enum iommu_cap cap) { const struct iommu_ops *ops; - if (!dev->iommu || !dev->iommu->iommu_dev) + if (!dev_has_iommu(dev)) return false; ops = dev_iommu_ops(dev); @@ -2120,11 +2118,9 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, static struct iommu_domain * __iommu_group_domain_alloc(struct iommu_group *group, unsigned int type) { - struct device *dev = - list_first_entry(&group->devices, struct group_device, list) - ->dev; + struct device *dev = iommu_group_first_dev(group); - return __iommu_domain_alloc(group_iommu_ops(group), dev, type); + return __iommu_domain_alloc(dev_iommu_ops(dev), dev, type); } struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) @@ -2987,8 +2983,8 @@ EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids); */ int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat) { - if (dev->iommu && dev->iommu->iommu_dev) { - const struct iommu_ops *ops = dev->iommu->iommu_dev->ops; + if (dev_has_iommu(dev)) { + const struct iommu_ops *ops = dev_iommu_ops(dev); if (ops->dev_enable_feat) return ops->dev_enable_feat(dev, feat); @@ -3003,8 +2999,8 @@ EXPORT_SYMBOL_GPL(iommu_dev_enable_feature); */ int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) { - if (dev->iommu && dev->iommu->iommu_dev) { - const struct iommu_ops *ops = dev->iommu->iommu_dev->ops; + if (dev_has_iommu(dev)) { + const struct iommu_ops *ops = dev_iommu_ops(dev); if (ops->dev_disable_feat) return ops->dev_disable_feat(dev, feat); -- GitLab From 1d8d43bb984b9cfa7ff63dbd0e2f6e5775ff1fdb Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 21 Nov 2023 18:03:58 +0000 Subject: [PATCH 0100/1290] iommu: Decouple iommu_present() from bus ops Much as I'd like to remove iommu_present(), the final remaining users are proving stubbornly difficult to clean up, so kick that can down the road and just rework it to preserve the current behaviour without depending on bus ops. Since commit 57365a04c921 ("iommu: Move bus setup to IOMMU device registration"), any registered IOMMU instance is already considered "present" for every entry in iommu_buses, so it's simply a case of validating the bus and checking we have at least once IOMMU. Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/caa93680bb9d35a8facbcd8ff46267ca67335229.1700589539.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5c555fc0d54ca..7fafd073c33e6 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2001,9 +2001,28 @@ int bus_iommu_probe(const struct bus_type *bus) return 0; } +/** + * iommu_present() - make platform-specific assumptions about an IOMMU + * @bus: bus to check + * + * Do not use this function. You want device_iommu_mapped() instead. + * + * Return: true if some IOMMU is present and aware of devices on the given bus; + * in general it may not be the only IOMMU, and it may not have anything to do + * with whatever device you are ultimately interested in. + */ bool iommu_present(const struct bus_type *bus) { - return bus->iommu_ops != NULL; + bool ret = false; + + for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) { + if (iommu_buses[i] == bus) { + spin_lock(&iommu_device_lock); + ret = !list_empty(&iommu_device_list); + spin_unlock(&iommu_device_lock); + } + } + return ret; } EXPORT_SYMBOL_GPL(iommu_present); -- GitLab From a9c362db39207c4934c9125e56ed730c5297c37c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 21 Nov 2023 18:03:59 +0000 Subject: [PATCH 0101/1290] iommu: Validate that devices match domains Before we can allow drivers to coexist, we need to make sure that one driver's domain ops can't misinterpret another driver's dev_iommu_priv data. To that end, add a token to the domain so we can remember how it was allocated - for now this may as well be the device ops, since they still correlate 1:1 with drivers. We can trust ourselves for internal default domain attachment, so add checks to cover all the public attach interfaces. Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Jerry Snitselaar Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/097c6f30480e4efe12195d00ba0e84ea4837fb4c.1700589539.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 10 ++++++++++ drivers/iommu/iommufd/hw_pagetable.c | 2 ++ include/linux/iommu.h | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 7fafd073c33e6..8e4436c606d30 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2117,6 +2117,7 @@ static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, return NULL; domain->type = type; + domain->owner = ops; /* * If not already set, assume all sizes by default; the driver * may override this later @@ -2282,10 +2283,16 @@ struct iommu_domain *iommu_get_dma_domain(struct device *dev) static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { + struct device *dev; + if (group->domain && group->domain != group->default_domain && group->domain != group->blocking_domain) return -EBUSY; + dev = iommu_group_first_dev(group); + if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner) + return -EINVAL; + return __iommu_group_set_domain(group, domain); } @@ -3477,6 +3484,9 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, if (!group) return -ENODEV; + if (!dev_has_iommu(dev) || dev_iommu_ops(dev) != domain->owner) + return -EINVAL; + mutex_lock(&group->mutex); curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL); if (curr) { diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 2abbeafdbd22d..5be7f513b622d 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -135,6 +135,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, hwpt->domain = NULL; goto out_abort; } + hwpt->domain->owner = ops; } else { hwpt->domain = iommu_domain_alloc(idev->dev->bus); if (!hwpt->domain) { @@ -233,6 +234,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, hwpt->domain = NULL; goto out_abort; } + hwpt->domain->owner = ops; if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ec289c1016f5f..077bf8cae2f7a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -106,7 +106,7 @@ struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; - + const struct iommu_ops *owner; /* Whose domain_alloc we came from */ unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; -- GitLab From b4c0497169d54e08346678dbc79ded7809dae5b7 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 21 Nov 2023 18:04:00 +0000 Subject: [PATCH 0102/1290] iommu: Decouple iommu_domain_alloc() from bus ops As the final remaining piece of bus-dependent API, iommu_domain_alloc() can now take responsibility for the "one iommu_ops per bus" rule for itself. It turns out we can't safely make the internal allocation call any more group-based or device-based yet - that will have to wait until the external callers can pass the right thing - but we can at least get as far as deriving "bus ops" based on which driver is actually managing devices on the given bus, rather than whichever driver won the race to register first. This will then leave us able to convert the last of the core internals over to the IOMMU-instance model, allow multiple drivers to register and actually coexist (modulo the above limitation for unmanaged domain users in the short term), and start trying to solve the long-standing iommu_probe_device() mess. Reviewed-by: Jason Gunthorpe Reviewed-by: Jerry Snitselaar Signed-off-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/6c7313009aae0e39ae2855920990ebf85af4662f.1700589539.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 8e4436c606d30..88aeae0acd9b5 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2143,12 +2143,31 @@ __iommu_group_domain_alloc(struct iommu_group *group, unsigned int type) return __iommu_domain_alloc(dev_iommu_ops(dev), dev, type); } +static int __iommu_domain_alloc_dev(struct device *dev, void *data) +{ + const struct iommu_ops **ops = data; + + if (!dev_has_iommu(dev)) + return 0; + + if (WARN_ONCE(*ops && *ops != dev_iommu_ops(dev), + "Multiple IOMMU drivers present for bus %s, which the public IOMMU API can't fully support yet. You will still need to disable one or more for this to work, sorry!\n", + dev_bus_name(dev))) + return -EBUSY; + + *ops = dev_iommu_ops(dev); + return 0; +} + struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) { - if (bus == NULL || bus->iommu_ops == NULL) + const struct iommu_ops *ops = NULL; + int err = bus_for_each_dev(bus, NULL, &ops, __iommu_domain_alloc_dev); + + if (err || !ops) return NULL; - return __iommu_domain_alloc(bus->iommu_ops, NULL, - IOMMU_DOMAIN_UNMANAGED); + + return __iommu_domain_alloc(ops, NULL, IOMMU_DOMAIN_UNMANAGED); } EXPORT_SYMBOL_GPL(iommu_domain_alloc); -- GitLab From 01bf81af85458c0427a70ae3bf90b375d038c95b Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 21 Nov 2023 18:04:01 +0000 Subject: [PATCH 0103/1290] iommu/arm-smmu: Don't register fwnode for legacy binding When using the legacy binding we bypass the of_xlate mechanism, so avoid registering the instance fwnodes which act as keys for that. This will help __iommu_probe_device() to retrieve the registered ops the same way as for x86 etc. when no fwspec has previously been set up by of_xlate. Acked-by: Will Deacon Reviewed-by: Jason Gunthorpe Reviewed-by: Jerry Snitselaar Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/18b0f812a42a74dd6924aea24e68ab409d6e1b52.1700589539.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d6d1a2a55cc06..4b83a3adacd6e 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2161,7 +2161,8 @@ static int arm_smmu_device_probe(struct platform_device *pdev) return err; } - err = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev); + err = iommu_device_register(&smmu->iommu, &arm_smmu_ops, + using_legacy_binding ? NULL : dev); if (err) { dev_err(dev, "Failed to register iommu\n"); iommu_device_sysfs_remove(&smmu->iommu); -- GitLab From 17de3f5fdd35676b0e3d41c7c9bf4e3032eb3673 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 21 Nov 2023 18:04:02 +0000 Subject: [PATCH 0104/1290] iommu: Retire bus ops With the rest of the API internals converted, it's time to finally tackle probe_device and how we bootstrap the per-device ops association to begin with. This ends up being disappointingly straightforward, since fwspec users are already doing it in order to find their of_xlate callback, and it works out that we can easily do the equivalent for other drivers too. Then shuffle the remaining awareness of iommu_ops into the couple of core headers that still need it, and breathe a sigh of relief. Ding dong the bus ops are gone! CC: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Greg Kroah-Hartman Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Jerry Snitselaar Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/a59011ef65b4b6657cb0b7a388d786b779b61305.1700589539.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 31 ++++++++++++++++++------------- include/acpi/acpi_bus.h | 2 ++ include/linux/device.h | 1 - include/linux/device/bus.h | 5 ----- include/linux/dma-map-ops.h | 1 + 5 files changed, 21 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 88aeae0acd9b5..254f42b4220b7 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -148,7 +148,7 @@ struct iommu_group_attribute iommu_group_attr_##_name = \ static LIST_HEAD(iommu_device_list); static DEFINE_SPINLOCK(iommu_device_lock); -static struct bus_type * const iommu_buses[] = { +static const struct bus_type * const iommu_buses[] = { &platform_bus_type, #ifdef CONFIG_PCI &pci_bus_type, @@ -257,13 +257,6 @@ int iommu_device_register(struct iommu_device *iommu, /* We need to be able to take module references appropriately */ if (WARN_ON(is_module_address((unsigned long)ops) && !ops->owner)) return -EINVAL; - /* - * Temporarily enforce global restriction to a single driver. This was - * already the de-facto behaviour, since any possible combination of - * existing drivers would compete for at least the PCI or platform bus. - */ - if (iommu_buses[0]->iommu_ops && iommu_buses[0]->iommu_ops != ops) - return -EBUSY; iommu->ops = ops; if (hwdev) @@ -273,10 +266,8 @@ int iommu_device_register(struct iommu_device *iommu, list_add_tail(&iommu->list, &iommu_device_list); spin_unlock(&iommu_device_lock); - for (int i = 0; i < ARRAY_SIZE(iommu_buses) && !err; i++) { - iommu_buses[i]->iommu_ops = ops; + for (int i = 0; i < ARRAY_SIZE(iommu_buses) && !err; i++) err = bus_iommu_probe(iommu_buses[i]); - } if (err) iommu_device_unregister(iommu); return err; @@ -329,7 +320,6 @@ int iommu_device_register_bus(struct iommu_device *iommu, list_add_tail(&iommu->list, &iommu_device_list); spin_unlock(&iommu_device_lock); - bus->iommu_ops = ops; err = bus_iommu_probe(bus); if (err) { iommu_device_unregister_bus(iommu, bus, nb); @@ -496,12 +486,27 @@ static void iommu_deinit_device(struct device *dev) static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { - const struct iommu_ops *ops = dev->bus->iommu_ops; + const struct iommu_ops *ops; + struct iommu_fwspec *fwspec; struct iommu_group *group; static DEFINE_MUTEX(iommu_probe_device_lock); struct group_device *gdev; int ret; + /* + * For FDT-based systems and ACPI IORT/VIOT, drivers register IOMMU + * instances with non-NULL fwnodes, and client devices should have been + * identified with a fwspec by this point. Otherwise, we can currently + * assume that only one of Intel, AMD, s390, PAMU or legacy SMMUv2 can + * be present, and that any of their registered instances has suitable + * ops for probing, and thus cheekily co-opt the same mechanism. + */ + fwspec = dev_iommu_fwspec_get(dev); + if (fwspec && fwspec->ops) + ops = fwspec->ops; + else + ops = iommu_ops_from_fwnode(NULL); + if (!ops) return -ENODEV; /* diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 1216d72c650fa..89079787905d4 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -627,6 +627,8 @@ struct acpi_pci_root { /* helper */ +struct iommu_ops; + bool acpi_dma_supported(const struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); int acpi_iommu_fwspec_init(struct device *dev, u32 id, diff --git a/include/linux/device.h b/include/linux/device.h index d7a72a8749ea0..0314dbbdb5345 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -42,7 +42,6 @@ struct class; struct subsys_private; struct device_node; struct fwnode_handle; -struct iommu_ops; struct iommu_group; struct dev_pin_info; struct dev_iommu; diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index ae10c43227543..e25aab08f873d 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -62,9 +62,6 @@ struct fwnode_handle; * this bus. * @pm: Power management operations of this bus, callback the specific * device driver's pm-ops. - * @iommu_ops: IOMMU specific operations for this bus, used to attach IOMMU - * driver implementations to a bus and allow the driver to do - * bus-specific setup * @need_parent_lock: When probing or removing a device on this bus, the * device core should lock the device's parent. * @@ -104,8 +101,6 @@ struct bus_type { const struct dev_pm_ops *pm; - const struct iommu_ops *iommu_ops; - bool need_parent_lock; }; diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index f2fc203fb8a1a..a52e508d1869f 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -11,6 +11,7 @@ #include struct cma; +struct iommu_ops; /* * Values for struct dma_map_ops.flags: -- GitLab From e7080665c977ea1aafb8547a9c7bd08b199311d6 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 21 Nov 2023 18:04:03 +0000 Subject: [PATCH 0105/1290] iommu: Clean up open-coded ownership checks Some drivers already implement their own defence against the possibility of being given someone else's device. Since this is now taken care of by the core code (and via a slightly different path from the original fwspec-based idea), let's clean them up. Acked-by: Will Deacon Reviewed-by: Jason Gunthorpe Reviewed-by: Jerry Snitselaar Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/58a9879ce3f03562bb061e6714fe6efb554c3907.1700589539.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 9 +-------- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 16 +++------------- drivers/iommu/mtk_iommu.c | 7 +------ drivers/iommu/mtk_iommu_v1.c | 3 --- drivers/iommu/sprd-iommu.c | 8 +------- drivers/iommu/virtio-iommu.c | 3 --- 7 files changed, 6 insertions(+), 43 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 7445454c2af24..fc4317c25b6d5 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2649,9 +2649,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) struct arm_smmu_master *master; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - if (!fwspec || fwspec->ops != &arm_smmu_ops) - return ERR_PTR(-ENODEV); - if (WARN_ON_ONCE(dev_iommu_priv_get(dev))) return ERR_PTR(-EBUSY); diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 4b83a3adacd6e..4d09c00478927 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1116,11 +1116,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) struct arm_smmu_device *smmu; int ret; - if (!fwspec || fwspec->ops != &arm_smmu_ops) { - dev_err(dev, "cannot attach to SMMU, is it on the same bus?\n"); - return -ENXIO; - } - /* * FIXME: The arch/arm DMA API code tries to attach devices to its own * domains between of_xlate() and probe_device() - we have no way to cope @@ -1357,10 +1352,8 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) fwspec = dev_iommu_fwspec_get(dev); if (ret) goto out_free; - } else if (fwspec && fwspec->ops == &arm_smmu_ops) { - smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode); } else { - return ERR_PTR(-ENODEV); + smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode); } ret = -EINVAL; diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index 97b2122032b23..33f3c870086ce 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -79,16 +79,6 @@ static struct qcom_iommu_domain *to_qcom_iommu_domain(struct iommu_domain *dom) static const struct iommu_ops qcom_iommu_ops; -static struct qcom_iommu_dev * to_iommu(struct device *dev) -{ - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - - if (!fwspec || fwspec->ops != &qcom_iommu_ops) - return NULL; - - return dev_iommu_priv_get(dev); -} - static struct qcom_iommu_ctx * to_ctx(struct qcom_iommu_domain *d, unsigned asid) { struct qcom_iommu_dev *qcom_iommu = d->iommu; @@ -372,7 +362,7 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain) static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) { - struct qcom_iommu_dev *qcom_iommu = to_iommu(dev); + struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); int ret; @@ -404,7 +394,7 @@ static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain, struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct qcom_iommu_domain *qcom_domain; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct qcom_iommu_dev *qcom_iommu = to_iommu(dev); + struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); unsigned int i; if (domain == identity_domain || !domain) @@ -535,7 +525,7 @@ static bool qcom_iommu_capable(struct device *dev, enum iommu_cap cap) static struct iommu_device *qcom_iommu_probe_device(struct device *dev) { - struct qcom_iommu_dev *qcom_iommu = to_iommu(dev); + struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); struct device_link *link; if (!qcom_iommu) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 75279500a4a82..7abe9e85a5706 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -863,16 +863,11 @@ static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain, static struct iommu_device *mtk_iommu_probe_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct mtk_iommu_data *data; + struct mtk_iommu_data *data = dev_iommu_priv_get(dev); struct device_link *link; struct device *larbdev; unsigned int larbid, larbidx, i; - if (!fwspec || fwspec->ops != &mtk_iommu_ops) - return ERR_PTR(-ENODEV); /* Not a iommu client device */ - - data = dev_iommu_priv_get(dev); - if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) return &data->iommu; diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 67e044c1a7d93..25b41222abaec 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -481,9 +481,6 @@ static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) idx++; } - if (!fwspec || fwspec->ops != &mtk_iommu_v1_ops) - return ERR_PTR(-ENODEV); /* Not a iommu client device */ - data = dev_iommu_priv_get(dev); /* Link the consumer device with the smi-larb device(supplier) */ diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index 2eb9fb46703b3..537359f109979 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -385,13 +385,7 @@ static phys_addr_t sprd_iommu_iova_to_phys(struct iommu_domain *domain, static struct iommu_device *sprd_iommu_probe_device(struct device *dev) { - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct sprd_iommu_device *sdev; - - if (!fwspec || fwspec->ops != &sprd_iommu_ops) - return ERR_PTR(-ENODEV); - - sdev = dev_iommu_priv_get(dev); + struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev); return &sdev->iommu; } diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 379ebe03efb6d..9bcffdde6175d 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -969,9 +969,6 @@ static struct iommu_device *viommu_probe_device(struct device *dev) struct viommu_dev *viommu = NULL; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - if (!fwspec || fwspec->ops != &viommu_ops) - return ERR_PTR(-ENODEV); - viommu = viommu_get_by_fwnode(fwspec->iommu_fwnode); if (!viommu) return ERR_PTR(-ENODEV); -- GitLab From 17b226dcf80ce79d02f4f0b08813d8848885b986 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 24 Nov 2023 15:24:33 +0100 Subject: [PATCH 0106/1290] iommu: Allow passing custom allocators to pgtable drivers This will be useful for GPU drivers who want to keep page tables in a pool so they can: - keep freed page tables in a free pool and speed-up upcoming page table allocations - batch page table allocation instead of allocating one page at a time - pre-reserve pages for page tables needed for map/unmap operations, to ensure map/unmap operations don't try to allocate memory in paths they're allowed to block or fail It might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. We will extend the Arm LPAE format to support custom allocators in a separate commit. Signed-off-by: Boris Brezillon Reviewed-by: Steven Price Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20231124142434.1577550-2-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable.c | 23 +++++++++++++++++++++++ include/linux/io-pgtable.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c index b843fcd365d28..8841c1487f004 100644 --- a/drivers/iommu/io-pgtable.c +++ b/drivers/iommu/io-pgtable.c @@ -34,6 +34,26 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = { #endif }; +static int check_custom_allocator(enum io_pgtable_fmt fmt, + struct io_pgtable_cfg *cfg) +{ + /* No custom allocator, no need to check the format. */ + if (!cfg->alloc && !cfg->free) + return 0; + + /* When passing a custom allocator, both the alloc and free + * functions should be provided. + */ + if (!cfg->alloc || !cfg->free) + return -EINVAL; + + /* Make sure the format supports custom allocators. */ + if (io_pgtable_init_table[fmt]->caps & IO_PGTABLE_CAP_CUSTOM_ALLOCATOR) + return 0; + + return -EINVAL; +} + struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt, struct io_pgtable_cfg *cfg, void *cookie) @@ -44,6 +64,9 @@ struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt, if (fmt >= IO_PGTABLE_NUM_FMTS) return NULL; + if (check_custom_allocator(fmt, cfg)) + return NULL; + fns = io_pgtable_init_table[fmt]; if (!fns) return NULL; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 25142a0e2fc2c..86cf1f7ae389a 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -100,6 +100,30 @@ struct io_pgtable_cfg { const struct iommu_flush_ops *tlb; struct device *iommu_dev; + /** + * @alloc: Custom page allocator. + * + * Optional hook used to allocate page tables. If this function is NULL, + * @free must be NULL too. + * + * Memory returned should be zeroed and suitable for dma_map_single() and + * virt_to_phys(). + * + * Not all formats support custom page allocators. Before considering + * passing a non-NULL value, make sure the chosen page format supports + * this feature. + */ + void *(*alloc)(void *cookie, size_t size, gfp_t gfp); + + /** + * @free: Custom page de-allocator. + * + * Optional hook used to free page tables allocated with the @alloc + * hook. Must be non-NULL if @alloc is not NULL, must be NULL + * otherwise. + */ + void (*free)(void *cookie, void *pages, size_t size); + /* Low-level data specific to the table format */ union { struct { @@ -241,16 +265,26 @@ io_pgtable_tlb_add_page(struct io_pgtable *iop, iop->cfg.tlb->tlb_add_page(gather, iova, granule, iop->cookie); } +/** + * enum io_pgtable_caps - IO page table backend capabilities. + */ +enum io_pgtable_caps { + /** @IO_PGTABLE_CAP_CUSTOM_ALLOCATOR: Backend accepts custom page table allocators. */ + IO_PGTABLE_CAP_CUSTOM_ALLOCATOR = BIT(0), +}; + /** * struct io_pgtable_init_fns - Alloc/free a set of page tables for a * particular format. * * @alloc: Allocate a set of page tables described by cfg. * @free: Free the page tables associated with iop. + * @caps: Combination of @io_pgtable_caps flags encoding the backend capabilities. */ struct io_pgtable_init_fns { struct io_pgtable *(*alloc)(struct io_pgtable_cfg *cfg, void *cookie); void (*free)(struct io_pgtable *iop); + u32 caps; }; extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns; -- GitLab From 87639e01e05c716d8fd5d63846b6d9a8b0a2e79a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 24 Nov 2023 15:24:34 +0100 Subject: [PATCH 0107/1290] iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon Reviewed-by: Steven Price Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm.c | 55 ++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 72dcdd468cf30..f7828a7aad410 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -188,20 +188,28 @@ static dma_addr_t __arm_lpae_dma_addr(void *pages) } static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp, - struct io_pgtable_cfg *cfg) + struct io_pgtable_cfg *cfg, + void *cookie) { struct device *dev = cfg->iommu_dev; int order = get_order(size); - struct page *p; dma_addr_t dma; void *pages; VM_BUG_ON((gfp & __GFP_HIGHMEM)); - p = alloc_pages_node(dev_to_node(dev), gfp | __GFP_ZERO, order); - if (!p) + + if (cfg->alloc) { + pages = cfg->alloc(cookie, size, gfp); + } else { + struct page *p; + + p = alloc_pages_node(dev_to_node(dev), gfp | __GFP_ZERO, order); + pages = p ? page_address(p) : NULL; + } + + if (!pages) return NULL; - pages = page_address(p); if (!cfg->coherent_walk) { dma = dma_map_single(dev, pages, size, DMA_TO_DEVICE); if (dma_mapping_error(dev, dma)) @@ -220,18 +228,28 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp, out_unmap: dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n"); dma_unmap_single(dev, dma, size, DMA_TO_DEVICE); + out_free: - __free_pages(p, order); + if (cfg->free) + cfg->free(cookie, pages, size); + else + free_pages((unsigned long)pages, order); + return NULL; } static void __arm_lpae_free_pages(void *pages, size_t size, - struct io_pgtable_cfg *cfg) + struct io_pgtable_cfg *cfg, + void *cookie) { if (!cfg->coherent_walk) dma_unmap_single(cfg->iommu_dev, __arm_lpae_dma_addr(pages), size, DMA_TO_DEVICE); - free_pages((unsigned long)pages, get_order(size)); + + if (cfg->free) + cfg->free(cookie, pages, size); + else + free_pages((unsigned long)pages, get_order(size)); } static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries, @@ -373,13 +391,13 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, /* Grab a pointer to the next level */ pte = READ_ONCE(*ptep); if (!pte) { - cptep = __arm_lpae_alloc_pages(tblsz, gfp, cfg); + cptep = __arm_lpae_alloc_pages(tblsz, gfp, cfg, data->iop.cookie); if (!cptep) return -ENOMEM; pte = arm_lpae_install_table(cptep, ptep, 0, data); if (pte) - __arm_lpae_free_pages(cptep, tblsz, cfg); + __arm_lpae_free_pages(cptep, tblsz, cfg, data->iop.cookie); } else if (!cfg->coherent_walk && !(pte & ARM_LPAE_PTE_SW_SYNC)) { __arm_lpae_sync_pte(ptep, 1, cfg); } @@ -524,7 +542,7 @@ static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl, __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data)); } - __arm_lpae_free_pages(start, table_size, &data->iop.cfg); + __arm_lpae_free_pages(start, table_size, &data->iop.cfg, data->iop.cookie); } static void arm_lpae_free_pgtable(struct io_pgtable *iop) @@ -552,7 +570,7 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) return 0; - tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg); + tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg, data->iop.cookie); if (!tablep) return 0; /* Bytes unmapped */ @@ -575,7 +593,7 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, pte = arm_lpae_install_table(tablep, ptep, blk_pte, data); if (pte != blk_pte) { - __arm_lpae_free_pages(tablep, tablesz, cfg); + __arm_lpae_free_pages(tablep, tablesz, cfg, data->iop.cookie); /* * We may race against someone unmapping another part of this * block, but anything else is invalid. We can't misinterpret @@ -882,7 +900,7 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) /* Looking good; allocate a pgd */ data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data), - GFP_KERNEL, cfg); + GFP_KERNEL, cfg, cookie); if (!data->pgd) goto out_free_data; @@ -984,7 +1002,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie) /* Allocate pgd pages */ data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data), - GFP_KERNEL, cfg); + GFP_KERNEL, cfg, cookie); if (!data->pgd) goto out_free_data; @@ -1059,7 +1077,7 @@ arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)); data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data), GFP_KERNEL, - cfg); + cfg, cookie); if (!data->pgd) goto out_free_data; @@ -1080,26 +1098,31 @@ out_free_data: } struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns = { + .caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR, .alloc = arm_64_lpae_alloc_pgtable_s1, .free = arm_lpae_free_pgtable, }; struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns = { + .caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR, .alloc = arm_64_lpae_alloc_pgtable_s2, .free = arm_lpae_free_pgtable, }; struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns = { + .caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR, .alloc = arm_32_lpae_alloc_pgtable_s1, .free = arm_lpae_free_pgtable, }; struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns = { + .caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR, .alloc = arm_32_lpae_alloc_pgtable_s2, .free = arm_lpae_free_pgtable, }; struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = { + .caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR, .alloc = arm_mali_lpae_alloc_pgtable, .free = arm_lpae_free_pgtable, }; -- GitLab From a08bd8df97b7549fe0edc53d087322314a0c4dfe Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Sun, 26 Nov 2023 16:16:59 +0100 Subject: [PATCH 0108/1290] dt-bindings: iommu: dart: Add t8103-usb4-dart compatible This DART variant is found in the t8103 (M1) SoCs and used for the USB4/Thunderbolt PCIe ports. Unlike the regular t8103 DART these support up to 64 SIDs and require a slightly different MMIO layout. Acked-by: Hector Martin Acked-by: Rob Herring Signed-off-by: Sven Peter Link: https://lore.kernel.org/r/20231126151701.16534-2-sven@svenpeter.dev Signed-off-by: Joerg Roedel --- Documentation/devicetree/bindings/iommu/apple,dart.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/iommu/apple,dart.yaml b/Documentation/devicetree/bindings/iommu/apple,dart.yaml index 903edf85d72e4..7adb1de455a5b 100644 --- a/Documentation/devicetree/bindings/iommu/apple,dart.yaml +++ b/Documentation/devicetree/bindings/iommu/apple,dart.yaml @@ -24,6 +24,7 @@ properties: compatible: enum: - apple,t8103-dart + - apple,t8103-usb4-dart - apple,t8110-dart - apple,t6000-dart -- GitLab From c58a17a99753749aabd979bc62babb2243266dcc Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Sun, 26 Nov 2023 16:17:00 +0100 Subject: [PATCH 0109/1290] iommu/apple-dart: Write to all DART_T8020_STREAM_SELECT We're about to add support for a DART variant that use more than 16 streams and requires writing to two separate stream select registers when issuing TLB flushes. Acked-by: Hector Martin Signed-off-by: Sven Peter Link: https://lore.kernel.org/r/20231126151701.16534-3-sven@svenpeter.dev Signed-off-by: Joerg Roedel --- drivers/iommu/apple-dart.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index ee05f4824bfad..8b6b0cdba9b57 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -368,12 +368,14 @@ apple_dart_t8020_hw_stream_command(struct apple_dart_stream_map *stream_map, u32 command) { unsigned long flags; - int ret; + int ret, i; u32 command_reg; spin_lock_irqsave(&stream_map->dart->lock, flags); - writel(stream_map->sidmap[0], stream_map->dart->regs + DART_T8020_STREAM_SELECT); + for (i = 0; i < BITS_TO_U32(stream_map->dart->num_streams); i++) + writel(stream_map->sidmap[i], + stream_map->dart->regs + DART_T8020_STREAM_SELECT + 4 * i); writel(command, stream_map->dart->regs + DART_T8020_STREAM_COMMAND); ret = readl_poll_timeout_atomic( -- GitLab From 863c092323ab17a5bc4b2f3185d4136f9e57bbe3 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Sun, 26 Nov 2023 16:17:01 +0100 Subject: [PATCH 0110/1290] iommu/apple-dart: Add support for t8103 USB4 DART This variant of the regular t8103 DART is used for the two USB4/Thunderbolt PCIe controllers. It supports 64 instead of 16 streams which requires a slightly different MMIO layout. Acked-by: Hector Martin Signed-off-by: Sven Peter Link: https://lore.kernel.org/r/20231126151701.16534-4-sven@svenpeter.dev Signed-off-by: Joerg Roedel --- drivers/iommu/apple-dart.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 8b6b0cdba9b57..f47f60c0a4d0d 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -81,6 +81,7 @@ #define DART_T8020_TCR_BYPASS_DAPF BIT(12) #define DART_T8020_TTBR 0x200 +#define DART_T8020_USB4_TTBR 0x400 #define DART_T8020_TTBR_VALID BIT(31) #define DART_T8020_TTBR_ADDR_FIELD_SHIFT 0 #define DART_T8020_TTBR_SHIFT 12 @@ -1217,6 +1218,33 @@ static const struct apple_dart_hw apple_dart_hw_t8103 = { .ttbr_shift = DART_T8020_TTBR_SHIFT, .ttbr_count = 4, }; + +static const struct apple_dart_hw apple_dart_hw_t8103_usb4 = { + .type = DART_T8020, + .irq_handler = apple_dart_t8020_irq, + .invalidate_tlb = apple_dart_t8020_hw_invalidate_tlb, + .oas = 36, + .fmt = APPLE_DART, + .max_sid_count = 64, + + .enable_streams = DART_T8020_STREAMS_ENABLE, + .lock = DART_T8020_CONFIG, + .lock_bit = DART_T8020_CONFIG_LOCK, + + .error = DART_T8020_ERROR, + + .tcr = DART_T8020_TCR, + .tcr_enabled = DART_T8020_TCR_TRANSLATE_ENABLE, + .tcr_disabled = 0, + .tcr_bypass = 0, + + .ttbr = DART_T8020_USB4_TTBR, + .ttbr_valid = DART_T8020_TTBR_VALID, + .ttbr_addr_field_shift = DART_T8020_TTBR_ADDR_FIELD_SHIFT, + .ttbr_shift = DART_T8020_TTBR_SHIFT, + .ttbr_count = 4, +}; + static const struct apple_dart_hw apple_dart_hw_t6000 = { .type = DART_T6000, .irq_handler = apple_dart_t8020_irq, @@ -1309,6 +1337,7 @@ static DEFINE_SIMPLE_DEV_PM_OPS(apple_dart_pm_ops, apple_dart_suspend, apple_dar static const struct of_device_id apple_dart_of_match[] = { { .compatible = "apple,t8103-dart", .data = &apple_dart_hw_t8103 }, + { .compatible = "apple,t8103-usb4-dart", .data = &apple_dart_hw_t8103_usb4 }, { .compatible = "apple,t8110-dart", .data = &apple_dart_hw_t8110 }, { .compatible = "apple,t6000-dart", .data = &apple_dart_hw_t6000 }, {}, -- GitLab From 60732292a135e8b8152858f219a563c05e9569e9 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Sun, 26 Nov 2023 17:20:09 +0100 Subject: [PATCH 0111/1290] iommu/apple-dart: Use readl instead of readl_relaxed for consistency While the readl_relaxed in apple_dart_suspend is correct the rest of the driver uses the non-relaxed variants everywhere and the single readl_relaxed is inconsistent and possibly confusing. Signed-off-by: Sven Peter Acked-by: Hector Martin Reviewed-by: Neal Gompa Link: https://lore.kernel.org/r/20231126162009.17934-1-sven@svenpeter.dev Signed-off-by: Joerg Roedel --- drivers/iommu/apple-dart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index f47f60c0a4d0d..7438e9c82ba98 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -1302,7 +1302,7 @@ static __maybe_unused int apple_dart_suspend(struct device *dev) unsigned int sid, idx; for (sid = 0; sid < dart->num_streams; sid++) { - dart->save_tcr[sid] = readl_relaxed(dart->regs + DART_TCR(dart, sid)); + dart->save_tcr[sid] = readl(dart->regs + DART_TCR(dart, sid)); for (idx = 0; idx < dart->hw->ttbr_count; idx++) dart->save_ttbr[sid][idx] = readl(dart->regs + DART_TTBR(dart, sid, idx)); -- GitLab From 130601d488fa06447283767e447909ce9e975e43 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Fri, 24 Nov 2023 09:41:14 +0100 Subject: [PATCH 0112/1290] dt-bindings: phy: amlogic,meson-axg-mipi-pcie-analog: drop text about parent syscon and drop example Since this bindings is referred from amlogic,meson-gx-hhi-sysctrl.yaml, drop the now useless description about the parent node and also drop the unnecessary example. Acked-by: Conor Dooley Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20231124-amlogic-v6-4-upstream-dsi-ccf-vim3-v9-3-95256ed139e6@linaro.org Signed-off-by: Vinod Koul --- .../phy/amlogic,meson-axg-mipi-pcie-analog.yaml | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/Documentation/devicetree/bindings/phy/amlogic,meson-axg-mipi-pcie-analog.yaml b/Documentation/devicetree/bindings/phy/amlogic,meson-axg-mipi-pcie-analog.yaml index 009a398083185..70def36e5688d 100644 --- a/Documentation/devicetree/bindings/phy/amlogic,meson-axg-mipi-pcie-analog.yaml +++ b/Documentation/devicetree/bindings/phy/amlogic,meson-axg-mipi-pcie-analog.yaml @@ -9,16 +9,6 @@ title: Amlogic AXG shared MIPI/PCIE analog PHY maintainers: - Remi Pommarel -description: |+ - The Everything-Else Power Domains node should be the child of a syscon - node with the required property: - - - compatible: Should be the following: - "amlogic,meson-gx-hhi-sysctrl", "simple-mfd", "syscon" - - Refer to the bindings described in - Documentation/devicetree/bindings/mfd/syscon.yaml - properties: compatible: const: amlogic,axg-mipi-pcie-analog-phy @@ -31,10 +21,3 @@ required: - "#phy-cells" additionalProperties: false - -examples: - - | - mpphy: phy { - compatible = "amlogic,axg-mipi-pcie-analog-phy"; - #phy-cells = <0>; - }; -- GitLab From 5f4a9a66f8a7582e90311fa8251da33a8d2111d7 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Fri, 24 Nov 2023 09:41:15 +0100 Subject: [PATCH 0113/1290] dt-bindings: phy: amlogic,g12a-mipi-dphy-analog: drop unneeded reg property and example The amlogic,g12a-mipi-dphy-analog is a feature of the simple-mfd amlogic,meson-axg-hhi-sysctrl system control register zone which is an intermixed registers zone, thus it's very hard to define clear ranges for each SoC controlled features even if possible. The amlogic,g12a-mipi-dphy-analog was wrongly documented as an independent register range, which is not the reality, thus fix the bindings by dropping the reg property now it's referred from amlogic,meson-gx-hhi-sysctrl.yaml and documented as a subnode of amlogic,meson-axg-hhi-sysctrl. Also drop the unnecessary example, the top level bindings example should be enough. Fixes: 76ab79f9726c ("dt-bindings: phy: add Amlogic G12A Analog MIPI D-PHY bindings") Signed-off-by: Neil Armstrong Reviewed-by: Rob Herring Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20231124-amlogic-v6-4-upstream-dsi-ccf-vim3-v9-4-95256ed139e6@linaro.org Signed-off-by: Vinod Koul --- .../bindings/phy/amlogic,g12a-mipi-dphy-analog.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Documentation/devicetree/bindings/phy/amlogic,g12a-mipi-dphy-analog.yaml b/Documentation/devicetree/bindings/phy/amlogic,g12a-mipi-dphy-analog.yaml index c8c83acfb871d..81c2654b7e57e 100644 --- a/Documentation/devicetree/bindings/phy/amlogic,g12a-mipi-dphy-analog.yaml +++ b/Documentation/devicetree/bindings/phy/amlogic,g12a-mipi-dphy-analog.yaml @@ -16,20 +16,8 @@ properties: "#phy-cells": const: 0 - reg: - maxItems: 1 - required: - compatible - - reg - "#phy-cells" additionalProperties: false - -examples: - - | - phy@0 { - compatible = "amlogic,g12a-mipi-dphy-analog"; - reg = <0x0 0xc>; - #phy-cells = <0>; - }; -- GitLab From fa50920b4f82993941e0aac349eb8081ce11e38f Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 23 Nov 2023 14:37:47 +0100 Subject: [PATCH 0114/1290] dt-bindings: phy: add compatible for Mediatek MT8195 Add the compatible string for MediaTek MT8195 SoC, using the same MIPI D-PHY block as the MT8183. Signed-off-by: Michael Walle Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20231123133749.2030661-3-mwalle@kernel.org Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/phy/mediatek,dsi-phy.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/phy/mediatek,dsi-phy.yaml b/Documentation/devicetree/bindings/phy/mediatek,dsi-phy.yaml index 6703689fcdbe1..f6e494d0d89b8 100644 --- a/Documentation/devicetree/bindings/phy/mediatek,dsi-phy.yaml +++ b/Documentation/devicetree/bindings/phy/mediatek,dsi-phy.yaml @@ -31,6 +31,7 @@ properties: - items: - enum: - mediatek,mt8188-mipi-tx + - mediatek,mt8195-mipi-tx - mediatek,mt8365-mipi-tx - const: mediatek,mt8183-mipi-tx - const: mediatek,mt2701-mipi-tx -- GitLab From 2fda59099462ee700e424ba3ac928d13ad6389a8 Mon Sep 17 00:00:00 2001 From: Alex Bee Date: Sun, 19 Nov 2023 13:13:36 +0100 Subject: [PATCH 0115/1290] phy: rockchip-inno-usb2: Split ID interrupt phy registers Commit 51a9b2c03dd3 ("phy: rockchip-inno-usb2: Handle ID IRQ") added ID detection interrupt registers. However the current implementation assumes that falling and rising edge interrupt are always enabled in registers spanning over subsequent bits. That is not the case for RK3128's version of the phy and this implementation can't be used as-is, since there are bits with different purpose in between. This splits up the register definitions for id_det_en, id_det_en and id_det_clr registers in rising and falling edge variants. It's required as preparation to support RK3128's Innosilicon usb2 phy as well in this driver and matches pretty much to what the vendor does, so I'm not expecting issues for other SoCs with that change. Signed-off-by: Alex Bee Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20231119121340.109025-2-knaerzche@gmail.com Signed-off-by: Vinod Koul --- drivers/phy/rockchip/phy-rockchip-inno-usb2.c | 99 +++++++++++++------ 1 file changed, 70 insertions(+), 29 deletions(-) diff --git a/drivers/phy/rockchip/phy-rockchip-inno-usb2.c b/drivers/phy/rockchip/phy-rockchip-inno-usb2.c index a24d2af154df7..b5a1d30df83a1 100644 --- a/drivers/phy/rockchip/phy-rockchip-inno-usb2.c +++ b/drivers/phy/rockchip/phy-rockchip-inno-usb2.c @@ -123,9 +123,12 @@ struct rockchip_chg_det_reg { * @disrise_en: host disconnect rise edge detection enable. * @disrise_st: host disconnect rise edge detection state. * @disrise_clr: host disconnect rise edge detection clear. - * @id_det_en: id detection enable register. - * @id_det_st: id detection state register. - * @id_det_clr: id detection clear register. + * @idfall_det_en: id detection enable register, falling edge + * @idfall_det_st: id detection state register, falling edge + * @idfall_det_clr: id detection clear register, falling edge + * @idrise_det_en: id detection enable register, rising edge + * @idrise_det_st: id detection state register, rising edge + * @idrise_det_clr: id detection clear register, rising edge * @ls_det_en: linestate detection enable register. * @ls_det_st: linestate detection state register. * @ls_det_clr: linestate detection clear register. @@ -146,9 +149,12 @@ struct rockchip_usb2phy_port_cfg { struct usb2phy_reg disrise_en; struct usb2phy_reg disrise_st; struct usb2phy_reg disrise_clr; - struct usb2phy_reg id_det_en; - struct usb2phy_reg id_det_st; - struct usb2phy_reg id_det_clr; + struct usb2phy_reg idfall_det_en; + struct usb2phy_reg idfall_det_st; + struct usb2phy_reg idfall_det_clr; + struct usb2phy_reg idrise_det_en; + struct usb2phy_reg idrise_det_st; + struct usb2phy_reg idrise_det_clr; struct usb2phy_reg ls_det_en; struct usb2phy_reg ls_det_st; struct usb2phy_reg ls_det_clr; @@ -488,15 +494,27 @@ static int rockchip_usb2phy_init(struct phy *phy) if (ret) goto out; - /* clear id status and enable id detect irq */ + /* clear id status and enable id detect irqs */ ret = property_enable(rphy->grf, - &rport->port_cfg->id_det_clr, + &rport->port_cfg->idfall_det_clr, true); if (ret) goto out; ret = property_enable(rphy->grf, - &rport->port_cfg->id_det_en, + &rport->port_cfg->idrise_det_clr, + true); + if (ret) + goto out; + + ret = property_enable(rphy->grf, + &rport->port_cfg->idfall_det_en, + true); + if (ret) + goto out; + + ret = property_enable(rphy->grf, + &rport->port_cfg->idrise_det_en, true); if (ret) goto out; @@ -1030,11 +1048,16 @@ static irqreturn_t rockchip_usb2phy_id_irq(int irq, void *data) struct rockchip_usb2phy *rphy = dev_get_drvdata(rport->phy->dev.parent); bool id; - if (!property_enabled(rphy->grf, &rport->port_cfg->id_det_st)) + if (!property_enabled(rphy->grf, &rport->port_cfg->idfall_det_st) && + !property_enabled(rphy->grf, &rport->port_cfg->idrise_det_st)) return IRQ_NONE; /* clear id detect irq pending status */ - property_enable(rphy->grf, &rport->port_cfg->id_det_clr, true); + if (property_enabled(rphy->grf, &rport->port_cfg->idfall_det_st)) + property_enable(rphy->grf, &rport->port_cfg->idfall_det_clr, true); + + if (property_enabled(rphy->grf, &rport->port_cfg->idrise_det_st)) + property_enable(rphy->grf, &rport->port_cfg->idrise_det_clr, true); id = property_enabled(rphy->grf, &rport->port_cfg->utmi_id); extcon_set_state_sync(rphy->edev, EXTCON_USB_HOST, !id); @@ -1524,9 +1547,12 @@ static const struct rockchip_usb2phy_cfg rk3228_phy_cfgs[] = { .bvalid_det_en = { 0x0680, 3, 3, 0, 1 }, .bvalid_det_st = { 0x0690, 3, 3, 0, 1 }, .bvalid_det_clr = { 0x06a0, 3, 3, 0, 1 }, - .id_det_en = { 0x0680, 6, 5, 0, 3 }, - .id_det_st = { 0x0690, 6, 5, 0, 3 }, - .id_det_clr = { 0x06a0, 6, 5, 0, 3 }, + .idfall_det_en = { 0x0680, 6, 6, 0, 1 }, + .idfall_det_st = { 0x0690, 6, 6, 0, 1 }, + .idfall_det_clr = { 0x06a0, 6, 6, 0, 1 }, + .idrise_det_en = { 0x0680, 5, 5, 0, 1 }, + .idrise_det_st = { 0x0690, 5, 5, 0, 1 }, + .idrise_det_clr = { 0x06a0, 5, 5, 0, 1 }, .ls_det_en = { 0x0680, 2, 2, 0, 1 }, .ls_det_st = { 0x0690, 2, 2, 0, 1 }, .ls_det_clr = { 0x06a0, 2, 2, 0, 1 }, @@ -1587,9 +1613,12 @@ static const struct rockchip_usb2phy_cfg rk3308_phy_cfgs[] = { .bvalid_det_en = { 0x3020, 3, 2, 0, 3 }, .bvalid_det_st = { 0x3024, 3, 2, 0, 3 }, .bvalid_det_clr = { 0x3028, 3, 2, 0, 3 }, - .id_det_en = { 0x3020, 5, 4, 0, 3 }, - .id_det_st = { 0x3024, 5, 4, 0, 3 }, - .id_det_clr = { 0x3028, 5, 4, 0, 3 }, + .idfall_det_en = { 0x3020, 5, 5, 0, 1 }, + .idfall_det_st = { 0x3024, 5, 5, 0, 1 }, + .idfall_det_clr = { 0x3028, 5, 5, 0, 1 }, + .idrise_det_en = { 0x3020, 4, 4, 0, 1 }, + .idrise_det_st = { 0x3024, 4, 4, 0, 1 }, + .idrise_det_clr = { 0x3028, 4, 4, 0, 1 }, .ls_det_en = { 0x3020, 0, 0, 0, 1 }, .ls_det_st = { 0x3024, 0, 0, 0, 1 }, .ls_det_clr = { 0x3028, 0, 0, 0, 1 }, @@ -1634,9 +1663,12 @@ static const struct rockchip_usb2phy_cfg rk3328_phy_cfgs[] = { .bvalid_det_en = { 0x0110, 3, 2, 0, 3 }, .bvalid_det_st = { 0x0114, 3, 2, 0, 3 }, .bvalid_det_clr = { 0x0118, 3, 2, 0, 3 }, - .id_det_en = { 0x0110, 5, 4, 0, 3 }, - .id_det_st = { 0x0114, 5, 4, 0, 3 }, - .id_det_clr = { 0x0118, 5, 4, 0, 3 }, + .idfall_det_en = { 0x0110, 5, 5, 0, 1 }, + .idfall_det_st = { 0x0114, 5, 5, 0, 1 }, + .idfall_det_clr = { 0x0118, 5, 5, 0, 1 }, + .idrise_det_en = { 0x0110, 4, 4, 0, 1 }, + .idrise_det_st = { 0x0114, 4, 4, 0, 1 }, + .idrise_det_clr = { 0x0118, 4, 4, 0, 1 }, .ls_det_en = { 0x0110, 0, 0, 0, 1 }, .ls_det_st = { 0x0114, 0, 0, 0, 1 }, .ls_det_clr = { 0x0118, 0, 0, 0, 1 }, @@ -1700,9 +1732,12 @@ static const struct rockchip_usb2phy_cfg rk3399_phy_cfgs[] = { .bvalid_det_en = { 0xe3c0, 3, 3, 0, 1 }, .bvalid_det_st = { 0xe3e0, 3, 3, 0, 1 }, .bvalid_det_clr = { 0xe3d0, 3, 3, 0, 1 }, - .id_det_en = { 0xe3c0, 5, 4, 0, 3 }, - .id_det_st = { 0xe3e0, 5, 4, 0, 3 }, - .id_det_clr = { 0xe3d0, 5, 4, 0, 3 }, + .idfall_det_en = { 0xe3c0, 5, 5, 0, 1 }, + .idfall_det_st = { 0xe3e0, 5, 5, 0, 1 }, + .idfall_det_clr = { 0xe3d0, 5, 5, 0, 1 }, + .idrise_det_en = { 0xe3c0, 4, 4, 0, 1 }, + .idrise_det_st = { 0xe3e0, 4, 4, 0, 1 }, + .idrise_det_clr = { 0xe3d0, 4, 4, 0, 1 }, .utmi_avalid = { 0xe2ac, 7, 7, 0, 1 }, .utmi_bvalid = { 0xe2ac, 12, 12, 0, 1 }, .utmi_id = { 0xe2ac, 8, 8, 0, 1 }, @@ -1739,9 +1774,12 @@ static const struct rockchip_usb2phy_cfg rk3399_phy_cfgs[] = { .bvalid_det_en = { 0xe3c0, 8, 8, 0, 1 }, .bvalid_det_st = { 0xe3e0, 8, 8, 0, 1 }, .bvalid_det_clr = { 0xe3d0, 8, 8, 0, 1 }, - .id_det_en = { 0xe3c0, 10, 9, 0, 3 }, - .id_det_st = { 0xe3e0, 10, 9, 0, 3 }, - .id_det_clr = { 0xe3d0, 10, 9, 0, 3 }, + .idfall_det_en = { 0xe3c0, 10, 10, 0, 1 }, + .idfall_det_st = { 0xe3e0, 10, 10, 0, 1 }, + .idfall_det_clr = { 0xe3d0, 10, 10, 0, 1 }, + .idrise_det_en = { 0xe3c0, 9, 9, 0, 1 }, + .idrise_det_st = { 0xe3e0, 9, 9, 0, 1 }, + .idrise_det_clr = { 0xe3d0, 9, 9, 0, 1 }, .utmi_avalid = { 0xe2ac, 10, 10, 0, 1 }, .utmi_bvalid = { 0xe2ac, 16, 16, 0, 1 }, .utmi_id = { 0xe2ac, 11, 11, 0, 1 }, @@ -1770,9 +1808,12 @@ static const struct rockchip_usb2phy_cfg rk3568_phy_cfgs[] = { .bvalid_det_en = { 0x0080, 3, 2, 0, 3 }, .bvalid_det_st = { 0x0084, 3, 2, 0, 3 }, .bvalid_det_clr = { 0x0088, 3, 2, 0, 3 }, - .id_det_en = { 0x0080, 5, 4, 0, 3 }, - .id_det_st = { 0x0084, 5, 4, 0, 3 }, - .id_det_clr = { 0x0088, 5, 4, 0, 3 }, + .idfall_det_en = { 0x0080, 5, 5, 0, 1 }, + .idfall_det_st = { 0x0084, 5, 5, 0, 1 }, + .idfall_det_clr = { 0x0088, 5, 5, 0, 1 }, + .idrise_det_en = { 0x0080, 4, 4, 0, 1 }, + .idrise_det_st = { 0x0084, 4, 4, 0, 1 }, + .idrise_det_clr = { 0x0088, 4, 4, 0, 1 }, .utmi_avalid = { 0x00c0, 10, 10, 0, 1 }, .utmi_bvalid = { 0x00c0, 9, 9, 0, 1 }, .utmi_id = { 0x00c0, 6, 6, 0, 1 }, -- GitLab From 62ff41017e147472b07de6125c3be82ce02a8dd7 Mon Sep 17 00:00:00 2001 From: Alex Bee Date: Sun, 19 Nov 2023 13:13:37 +0100 Subject: [PATCH 0116/1290] phy: phy-rockchip-inno-usb2: Add RK3128 support Add registers to support the 2-port usb2 phy found in RK312x SoC familiy. Signed-off-by: Alex Bee Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20231119121340.109025-3-knaerzche@gmail.com Signed-off-by: Vinod Koul --- drivers/phy/rockchip/phy-rockchip-inno-usb2.c | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/drivers/phy/rockchip/phy-rockchip-inno-usb2.c b/drivers/phy/rockchip/phy-rockchip-inno-usb2.c index b5a1d30df83a1..4f71373ae6e1a 100644 --- a/drivers/phy/rockchip/phy-rockchip-inno-usb2.c +++ b/drivers/phy/rockchip/phy-rockchip-inno-usb2.c @@ -1487,6 +1487,14 @@ put_child: return ret; } +static int rk3128_usb2phy_tuning(struct rockchip_usb2phy *rphy) +{ + /* Turn off differential receiver in suspend mode */ + return regmap_write_bits(rphy->grf, 0x298, + BIT(2) << BIT_WRITEABLE_SHIFT | BIT(2), + BIT(2) << BIT_WRITEABLE_SHIFT | 0); +} + static int rk3588_usb2phy_tuning(struct rockchip_usb2phy *rphy) { int ret; @@ -1536,6 +1544,54 @@ static int rk3588_usb2phy_tuning(struct rockchip_usb2phy *rphy) return ret; } +static const struct rockchip_usb2phy_cfg rk3128_phy_cfgs[] = { + { + .reg = 0x17c, + .num_ports = 2, + .phy_tuning = rk3128_usb2phy_tuning, + .clkout_ctl = { 0x0190, 15, 15, 1, 0 }, + .port_cfgs = { + [USB2PHY_PORT_OTG] = { + .phy_sus = { 0x017c, 8, 0, 0, 0x1d1 }, + .bvalid_det_en = { 0x017c, 14, 14, 0, 1 }, + .bvalid_det_st = { 0x017c, 15, 15, 0, 1 }, + .bvalid_det_clr = { 0x017c, 15, 15, 0, 1 }, + .idfall_det_en = { 0x01a0, 2, 2, 0, 1 }, + .idfall_det_st = { 0x01a0, 3, 3, 0, 1 }, + .idfall_det_clr = { 0x01a0, 3, 3, 0, 1 }, + .idrise_det_en = { 0x01a0, 0, 0, 0, 1 }, + .idrise_det_st = { 0x01a0, 1, 1, 0, 1 }, + .idrise_det_clr = { 0x01a0, 1, 1, 0, 1 }, + .ls_det_en = { 0x017c, 12, 12, 0, 1 }, + .ls_det_st = { 0x017c, 13, 13, 0, 1 }, + .ls_det_clr = { 0x017c, 13, 13, 0, 1 }, + .utmi_bvalid = { 0x014c, 5, 5, 0, 1 }, + .utmi_id = { 0x014c, 8, 8, 0, 1 }, + .utmi_ls = { 0x014c, 7, 6, 0, 1 }, + }, + [USB2PHY_PORT_HOST] = { + .phy_sus = { 0x0194, 8, 0, 0, 0x1d1 }, + .ls_det_en = { 0x0194, 14, 14, 0, 1 }, + .ls_det_st = { 0x0194, 15, 15, 0, 1 }, + .ls_det_clr = { 0x0194, 15, 15, 0, 1 } + } + }, + .chg_det = { + .opmode = { 0x017c, 3, 0, 5, 1 }, + .cp_det = { 0x02c0, 6, 6, 0, 1 }, + .dcp_det = { 0x02c0, 5, 5, 0, 1 }, + .dp_det = { 0x02c0, 7, 7, 0, 1 }, + .idm_sink_en = { 0x0184, 8, 8, 0, 1 }, + .idp_sink_en = { 0x0184, 7, 7, 0, 1 }, + .idp_src_en = { 0x0184, 9, 9, 0, 1 }, + .rdm_pdwn_en = { 0x0184, 10, 10, 0, 1 }, + .vdm_src_en = { 0x0184, 12, 12, 0, 1 }, + .vdp_src_en = { 0x0184, 11, 11, 0, 1 }, + }, + }, + { /* sentinel */ } +}; + static const struct rockchip_usb2phy_cfg rk3228_phy_cfgs[] = { { .reg = 0x760, @@ -2031,6 +2087,7 @@ static const struct rockchip_usb2phy_cfg rv1108_phy_cfgs[] = { static const struct of_device_id rockchip_usb2phy_dt_match[] = { { .compatible = "rockchip,px30-usb2phy", .data = &rk3328_phy_cfgs }, + { .compatible = "rockchip,rk3128-usb2phy", .data = &rk3128_phy_cfgs }, { .compatible = "rockchip,rk3228-usb2phy", .data = &rk3228_phy_cfgs }, { .compatible = "rockchip,rk3308-usb2phy", .data = &rk3308_phy_cfgs }, { .compatible = "rockchip,rk3328-usb2phy", .data = &rk3328_phy_cfgs }, -- GitLab From 7f6f9e0def00cfaeb1d034fd13dbd84470aeccbd Mon Sep 17 00:00:00 2001 From: Rohit Agarwal Date: Fri, 17 Nov 2023 11:45:01 +0530 Subject: [PATCH 0117/1290] phy: qcom-qmp-usb: Add Qualcomm SDX75 USB3 PHY support Add support for USB3 QMP PHY found in SDX75 platform. Signed-off-by: Rohit Agarwal Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20231117061501.537529-1-quic_rohiagar@quicinc.com Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-usb.c | 173 ++++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-usb.c b/drivers/phy/qualcomm/phy-qcom-qmp-usb.c index 02f156298e77c..77e91b3eae79e 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-usb.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-usb.c @@ -24,6 +24,7 @@ #include "phy-qcom-qmp-pcs-misc-v4.h" #include "phy-qcom-qmp-pcs-usb-v4.h" #include "phy-qcom-qmp-pcs-usb-v5.h" +#include "phy-qcom-qmp-pcs-usb-v6.h" /* QPHY_SW_RESET bit */ #define SW_RESET BIT(0) @@ -151,6 +152,17 @@ static const unsigned int qmp_v5_usb3phy_regs_layout[QPHY_LAYOUT_SIZE] = { [QPHY_PCS_LFPS_RXTERM_IRQ_CLEAR] = QPHY_V5_PCS_USB3_LFPS_RXTERM_IRQ_CLEAR, }; +static const unsigned int qmp_v6_usb3phy_regs_layout[QPHY_LAYOUT_SIZE] = { + [QPHY_SW_RESET] = QPHY_V6_PCS_SW_RESET, + [QPHY_START_CTRL] = QPHY_V6_PCS_START_CONTROL, + [QPHY_PCS_STATUS] = QPHY_V6_PCS_PCS_STATUS1, + [QPHY_PCS_POWER_DOWN_CONTROL] = QPHY_V6_PCS_POWER_DOWN_CONTROL, + + /* In PCS_USB */ + [QPHY_PCS_AUTONOMOUS_MODE_CTRL] = QPHY_V6_PCS_USB3_AUTONOMOUS_MODE_CTRL, + [QPHY_PCS_LFPS_RXTERM_IRQ_CLEAR] = QPHY_V6_PCS_USB3_LFPS_RXTERM_IRQ_CLEAR, +}; + static const struct qmp_phy_init_tbl ipq9574_usb3_serdes_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_COM_SYSCLK_EN_SEL, 0x1a), QMP_PHY_INIT_CFG(QSERDES_COM_BIAS_EN_CLKBUFLR_EN, 0x08), @@ -871,6 +883,134 @@ static const struct qmp_phy_init_tbl sdx65_usb3_uniphy_rx_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V5_RX_SIGDET_ENABLES, 0x00), }; +static const struct qmp_phy_init_tbl sdx75_usb3_uniphy_serdes_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE1_MODE1, 0x9e), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE2_MODE1, 0x06), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CP_CTRL_MODE1, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_RCTRL_MODE1, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE1, 0x36), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CORECLK_DIV_MODE1, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE1, 0x2e), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE1, 0x82), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE1, 0x82), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START1_MODE1, 0xab), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START2_MODE1, 0xea), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START3_MODE1, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE1_MODE1, 0x25), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE2_MODE1, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE1_MODE1, 0xb7), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE2_MODE1, 0x1e), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE1_MODE0, 0xb7), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE2_MODE0, 0x1e), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE1_MODE0, 0x9e), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE2_MODE0, 0x06), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CP_CTRL_MODE0, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_RCTRL_MODE0, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE0, 0x36), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x12), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x34), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x82), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START1_MODE0, 0xab), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START2_MODE0, 0xea), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START3_MODE0, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE1_MODE0, 0x25), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE2_MODE0, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BG_TIMER, 0x0e), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_EN_CENTER, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER1, 0x31), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER2, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYSCLK_BUF_ENABLE, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYSCLK_EN_SEL, 0x1a), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_CFG, 0x14), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_MAP, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CORE_CLK_EN, 0x20), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CMN_CONFIG_1, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_AUTO_GAIN_ADJ_CTRL_1, 0xb6), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_AUTO_GAIN_ADJ_CTRL_2, 0x4b), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_AUTO_GAIN_ADJ_CTRL_3, 0x37), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_ADDITIONAL_MISC, 0x0c), +}; + +static const struct qmp_phy_init_tbl sdx75_usb3_uniphy_tx_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_TX_RES_CODE_LANE_TX, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_TX_RES_CODE_LANE_RX, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_TX_RES_CODE_LANE_OFFSET_TX, 0x1f), + QMP_PHY_INIT_CFG(QSERDES_V6_TX_RES_CODE_LANE_OFFSET_RX, 0x09), + QMP_PHY_INIT_CFG(QSERDES_V6_TX_LANE_MODE_1, 0xf5), + QMP_PHY_INIT_CFG(QSERDES_V6_TX_LANE_MODE_3, 0x3f), + QMP_PHY_INIT_CFG(QSERDES_V6_TX_LANE_MODE_4, 0x3f), + QMP_PHY_INIT_CFG(QSERDES_V6_TX_LANE_MODE_5, 0x5f), + QMP_PHY_INIT_CFG(QSERDES_V6_TX_RCV_DETECT_LVL_2, 0x12), + QMP_PHY_INIT_CFG(QSERDES_V6_TX_PI_QEC_CTRL, 0x21), +}; + +static const struct qmp_phy_init_tbl sdx75_usb3_uniphy_rx_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_RX_UCDR_FO_GAIN, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_UCDR_SO_GAIN, 0x06), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_UCDR_FASTLOCK_FO_GAIN, 0x2f), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_UCDR_SO_SATURATION_AND_ENABLE, 0x7f), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_UCDR_FASTLOCK_COUNT_LOW, 0xff), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_UCDR_FASTLOCK_COUNT_HIGH, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_UCDR_PI_CONTROLS, 0x99), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_UCDR_SB2_THRESH1, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_UCDR_SB2_THRESH2, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_UCDR_SB2_GAIN1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_UCDR_SB2_GAIN2, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_AUX_DATA_TCOARSE_TFINE, 0xa0), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_VGA_CAL_CNTRL1, 0x54), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_VGA_CAL_CNTRL2, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_GM_CAL, 0x13), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_EQU_ADAPTOR_CNTRL2, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_EQU_ADAPTOR_CNTRL3, 0x4a), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_EQU_ADAPTOR_CNTRL4, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_IDAC_TSETTLE_LOW, 0x07), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_IDAC_TSETTLE_HIGH, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_EQ_OFFSET_ADAPTOR_CNTRL1, 0x47), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_SIGDET_CNTRL, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_SIGDET_DEGLITCH_CNTRL, 0x0e), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_MODE_00_LOW, 0x3f), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_MODE_00_HIGH, 0xbf), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_MODE_00_HIGH2, 0xff), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_MODE_00_HIGH3, 0xdf), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_MODE_00_HIGH4, 0xed), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_MODE_01_LOW, 0xdc), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_MODE_01_HIGH, 0x5c), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_MODE_01_HIGH2, 0x9c), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_MODE_01_HIGH3, 0x1d), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_RX_MODE_01_HIGH4, 0x09), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_DFE_EN_TIMER, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_DFE_CTLE_POST_CAL_OFFSET, 0x38), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_DCC_CTRL1, 0x0c), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_VTH_CODE, 0x10), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_SIGDET_CAL_CTRL1, 0x14), + QMP_PHY_INIT_CFG(QSERDES_V6_RX_SIGDET_CAL_TRIM, 0x08), +}; + +static const struct qmp_phy_init_tbl sdx75_usb3_uniphy_pcs_tbl[] = { + QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG1, 0xc4), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG2, 0x89), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG3, 0x20), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG6, 0x13), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_REFGEN_REQ_CONFIG1, 0x21), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_RX_SIGDET_LVL, 0xaa), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_RCVR_DTCT_DLY_P1U2_L, 0xe7), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_RCVR_DTCT_DLY_P1U2_H, 0x03), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_CDR_RESET_TIME, 0x0a), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_ALIGN_DETECT_CONFIG1, 0x88), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_ALIGN_DETECT_CONFIG2, 0x13), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_PCS_TX_RX_CONFIG, 0x0c), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_EQ_CONFIG1, 0x4b), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_EQ_CONFIG5, 0x10), +}; + +static const struct qmp_phy_init_tbl sdx75_usb3_uniphy_pcs_usb_tbl[] = { + QMP_PHY_INIT_CFG(QPHY_V6_PCS_USB3_LFPS_DET_HIGH_COUNT_VAL, 0xf8), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_USB3_RXEQTRAINING_DFE_TIME_S2, 0x07), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_USB3_RCVR_DTCT_DLY_U3_L, 0x40), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_USB3_RCVR_DTCT_DLY_U3_H, 0x00), +}; + static const struct qmp_phy_init_tbl sm8350_usb3_uniphy_tx_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_V5_TX_LANE_MODE_1, 0xa5), QMP_PHY_INIT_CFG(QSERDES_V5_TX_LANE_MODE_2, 0x82), @@ -1317,6 +1457,14 @@ static const struct qmp_usb_offsets qmp_usb_offsets_v5 = { .rx = 0x1000, }; +static const struct qmp_usb_offsets qmp_usb_offsets_v6 = { + .serdes = 0, + .pcs = 0x0200, + .pcs_usb = 0x1200, + .tx = 0x0e00, + .rx = 0x1000, +}; + static const struct qmp_phy_cfg ipq8074_usb3phy_cfg = { .lanes = 1, @@ -1541,6 +1689,28 @@ static const struct qmp_phy_cfg sdx65_usb3_uniphy_cfg = { .has_pwrdn_delay = true, }; +static const struct qmp_phy_cfg sdx75_usb3_uniphy_cfg = { + .lanes = 1, + .offsets = &qmp_usb_offsets_v6, + + .serdes_tbl = sdx75_usb3_uniphy_serdes_tbl, + .serdes_tbl_num = ARRAY_SIZE(sdx75_usb3_uniphy_serdes_tbl), + .tx_tbl = sdx75_usb3_uniphy_tx_tbl, + .tx_tbl_num = ARRAY_SIZE(sdx75_usb3_uniphy_tx_tbl), + .rx_tbl = sdx75_usb3_uniphy_rx_tbl, + .rx_tbl_num = ARRAY_SIZE(sdx75_usb3_uniphy_rx_tbl), + .pcs_tbl = sdx75_usb3_uniphy_pcs_tbl, + .pcs_tbl_num = ARRAY_SIZE(sdx75_usb3_uniphy_pcs_tbl), + .pcs_usb_tbl = sdx75_usb3_uniphy_pcs_usb_tbl, + .pcs_usb_tbl_num = ARRAY_SIZE(sdx75_usb3_uniphy_pcs_usb_tbl), + .vreg_list = qmp_phy_vreg_l, + .num_vregs = ARRAY_SIZE(qmp_phy_vreg_l), + .regs = qmp_v6_usb3phy_regs_layout, + .pcs_usb_offset = 0x1000, + + .has_pwrdn_delay = true, +}; + static const struct qmp_phy_cfg sm8350_usb3_uniphy_cfg = { .lanes = 1, @@ -2256,6 +2426,9 @@ static const struct of_device_id qmp_usb_of_match_table[] = { }, { .compatible = "qcom,sdx65-qmp-usb3-uni-phy", .data = &sdx65_usb3_uniphy_cfg, + }, { + .compatible = "qcom,sdx75-qmp-usb3-uni-phy", + .data = &sdx75_usb3_uniphy_cfg, }, { .compatible = "qcom,sm6115-qmp-usb3-phy", .data = &qcm2290_usb3phy_cfg, -- GitLab From 72b4ca7e993e94f09bcf6d19fc385a2e8060c71f Mon Sep 17 00:00:00 2001 From: Nick Forrington Date: Thu, 2 Nov 2023 16:22:24 +0000 Subject: [PATCH 0118/1290] perf test: Remove atomics from test_loop to avoid test failures The current use of atomics can lead to test failures, as tests (such as tests/shell/record.sh) search for samples with "test_loop" as the top-most stack frame, but find frames related to the atomic operation (e.g. __aarch64_ldadd4_relax). This change simply removes the "count" variable, as it is not necessary. Fixes: 1962ab6f6e0b39e4 ("perf test workload thloop: Make count increments atomic") Reviewed-by: James Clark Signed-off-by: Nick Forrington Acked-by: Leo Yan Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/r/20231102162225.50028-1-nick.forrington@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/workloads/thloop.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/perf/tests/workloads/thloop.c b/tools/perf/tests/workloads/thloop.c index af05269c2eb8a..457b29f91c3ee 100644 --- a/tools/perf/tests/workloads/thloop.c +++ b/tools/perf/tests/workloads/thloop.c @@ -7,7 +7,6 @@ #include "../tests.h" static volatile sig_atomic_t done; -static volatile unsigned count; /* We want to check this symbol in perf report */ noinline void test_loop(void); @@ -19,8 +18,7 @@ static void sighandler(int sig __maybe_unused) noinline void test_loop(void) { - while (!done) - __atomic_fetch_add(&count, 1, __ATOMIC_RELAXED); + while (!done); } static void *thfunc(void *arg) -- GitLab From b457c526072aa8ab312517010837b06d38b9bb66 Mon Sep 17 00:00:00 2001 From: Paran Lee Date: Tue, 21 Nov 2023 07:32:19 +0900 Subject: [PATCH 0119/1290] perf script python: Fail check on dynamic allocation Add PyList_New() Fail check in get_field_numeric_entry() function and dynamic allocation checking for set_regs_in_dict(), python_start_script(). Reviewed-by: Adrian Hunter Reviewed-by: MichelleJin Signed-off-by: Paran Lee Cc: Alexander Shishkin Cc: Austin Kim Cc: Honggyu Kim Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Li Dong Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sean Christopherson Link: https://lore.kernel.org/r/20231120223218.9036-1-p4ranlee@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- .../util/scripting-engines/trace-event-python.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 94312741443ab..860e1837ba969 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -353,6 +353,8 @@ static PyObject *get_field_numeric_entry(struct tep_event *event, if (is_array) { list = PyList_New(field->arraylen); + if (!list) + Py_FatalError("couldn't create Python list"); item_size = field->size / field->arraylen; n_items = field->arraylen; } else { @@ -754,7 +756,7 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, ch } } -static void set_regs_in_dict(PyObject *dict, +static int set_regs_in_dict(PyObject *dict, struct perf_sample *sample, struct evsel *evsel) { @@ -770,6 +772,8 @@ static void set_regs_in_dict(PyObject *dict, */ int size = __sw_hweight64(attr->sample_regs_intr) * 28; char *bf = malloc(size); + if (!bf) + return -1; regs_map(&sample->intr_regs, attr->sample_regs_intr, arch, bf, size); @@ -781,6 +785,8 @@ static void set_regs_in_dict(PyObject *dict, pydict_set_item_string_decref(dict, "uregs", _PyUnicode_FromString(bf)); free(bf); + + return 0; } static void set_sym_in_dict(PyObject *dict, struct addr_location *al, @@ -920,7 +926,8 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample, PyLong_FromUnsignedLongLong(sample->cyc_cnt)); } - set_regs_in_dict(dict, sample, evsel); + if (set_regs_in_dict(dict, sample, evsel)) + Py_FatalError("Failed to setting regs in dict"); return dict; } @@ -1918,12 +1925,18 @@ static int python_start_script(const char *script, int argc, const char **argv, scripting_context->session = session; #if PY_MAJOR_VERSION < 3 command_line = malloc((argc + 1) * sizeof(const char *)); + if (!command_line) + return -1; + command_line[0] = script; for (i = 1; i < argc + 1; i++) command_line[i] = argv[i - 1]; PyImport_AppendInittab(name, initperf_trace_context); #else command_line = malloc((argc + 1) * sizeof(wchar_t *)); + if (!command_line) + return -1; + command_line[0] = Py_DecodeLocale(script, NULL); for (i = 1; i < argc + 1; i++) command_line[i] = Py_DecodeLocale(argv[i - 1], NULL); -- GitLab From cd38d6b5fa2dfc3beb7311f0b373e3141620c76b Mon Sep 17 00:00:00 2001 From: zhaimingbing Date: Mon, 20 Nov 2023 19:23:56 +0800 Subject: [PATCH 0120/1290] perf script perl: Fail check on dynamic allocation Return ENOMEM when dynamic allocation failed. Reviewed-by: Ian Rogers Signed-off-by: zhaimingbing Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Li Dong Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sean Christopherson Link: https://lore.kernel.org/r/20231120112356.8652-1-zhaimingbing@cmss.chinamobile.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/scripting-engines/trace-event-perl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index 603091317bed9..b072ac5d3bc22 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -490,6 +490,9 @@ static int perl_start_script(const char *script, int argc, const char **argv, scripting_context->session = session; command_line = malloc((argc + 2) * sizeof(const char *)); + if (!command_line) + return -ENOMEM; + command_line[0] = ""; command_line[1] = script; for (i = 2; i < argc + 2; i++) -- GitLab From 697579629f850d8f863147351e12e74c50114a8a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 7 Nov 2023 10:40:20 -0800 Subject: [PATCH 0121/1290] perf test: Basic branch counter support Add a basic test for the branch counter feature. The test verifies that - The new filter can be successfully applied on the supported platforms. - The counter value can be outputted via the perf report -D Signed-off-by: Kan Liang Tested-by: Ian Rogers Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Tinghao Zhang Link: https://lore.kernel.org/r/20231107184020.1497571-1-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record.sh | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh index 29443b8e88765..c74412c68e65a 100755 --- a/tools/perf/tests/shell/record.sh +++ b/tools/perf/tests/shell/record.sh @@ -12,6 +12,9 @@ err=0 perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX) testprog="perf test -w thloop" testsym="test_loop" +cpu_pmu_dir="/sys/bus/event_source/devices/cpu*" +br_cntr_file="/caps/branch_counter_nr" +br_cntr_output="branch stack counters" cleanup() { rm -rf "${perfdata}" @@ -155,10 +158,37 @@ test_workload() { echo "Basic target workload test [Success]" } +test_branch_counter() { + echo "Basic branch counter test" + # Check if the branch counter feature is supported + for dir in $cpu_pmu_dir + do + if [ ! -e "$dir$br_cntr_file" ] + then + echo "branch counter feature not supported on all core PMUs ($dir) [Skipped]" + return + fi + done + if ! perf record -o "${perfdata}" -j any,counter ${testprog} 2> /dev/null + then + echo "Basic branch counter test [Failed record]" + err=1 + return + fi + if ! perf report -i "${perfdata}" -D -q | grep -q "$br_cntr_output" + then + echo "Basic branch record test [Failed missing output]" + err=1 + return + fi + echo "Basic branch counter test [Success]" +} + test_per_thread test_register_capture test_system_wide test_workload +test_branch_counter cleanup exit $err -- GitLab From 2dbba30fd69b604802a9535b74bddb5bcca23793 Mon Sep 17 00:00:00 2001 From: James Clark Date: Fri, 1 Sep 2023 14:37:15 +0100 Subject: [PATCH 0122/1290] perf cs-etm: Bump minimum OpenCSD version to ensure a bugfix is present Since commit d927ef5004ef ("perf cs-etm: Add exception level consistency check"), the exception that was added to Perf will be triggered unless the following bugfix from OpenCSD is present: - _Version 1.2.1_: - __Bugfix__: ETM4x / ETE - output of context elements to client can in some circumstances be delayed until after subsequent atoms have been processed leading to incorrect memory decode access via the client callbacks. Fixed to flush context elements immediately they are committed. Rather than remove the assert and silently fail, just increase the minimum version requirement to avoid hard to debug issues and regressions. Reviewed-by: Ian Rogers Signed-off-by: James Clark Tested-by: Leo Yan Cc: John Garry Cc: Mike Leach Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230901133716.677499-1-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/feature/test-libopencsd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/build/feature/test-libopencsd.c b/tools/build/feature/test-libopencsd.c index eb6303ff446ed..4cfcef9da3e43 100644 --- a/tools/build/feature/test-libopencsd.c +++ b/tools/build/feature/test-libopencsd.c @@ -4,9 +4,9 @@ /* * Check OpenCSD library version is sufficient to provide required features */ -#define OCSD_MIN_VER ((1 << 16) | (1 << 8) | (1)) +#define OCSD_MIN_VER ((1 << 16) | (2 << 8) | (1)) #if !defined(OCSD_VER_NUM) || (OCSD_VER_NUM < OCSD_MIN_VER) -#error "OpenCSD >= 1.1.1 is required" +#error "OpenCSD >= 1.2.1 is required" #endif int main(void) -- GitLab From 26218331f49c858d52e60418cf3cef4c3fa6cf2e Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Sat, 14 Oct 2023 15:45:12 +0800 Subject: [PATCH 0123/1290] perf auxtrace: Add 'T' itrace option for timestamp trace An AUX trace can contain timestamp, but in some situations, the hardware trace module (e.g. Arm CoreSight) cannot decide the traced timestamp is the same source with CPU's time, thus the decoder can not use the timestamp trace for samples. This patch introduces 'T' itrace option. If users know the platforms they are working on have the same time counter with CPUs, users can use this new option to tell a decoder for using timestamp trace as kernel time. Signed-off-by: Leo Yan Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20231014074513.1668000-2-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/itrace.txt | 1 + tools/perf/util/auxtrace.c | 3 +++ tools/perf/util/auxtrace.h | 3 +++ 3 files changed, 7 insertions(+) diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt index a97f95825b14e..19cc179be9a78 100644 --- a/tools/perf/Documentation/itrace.txt +++ b/tools/perf/Documentation/itrace.txt @@ -25,6 +25,7 @@ q quicker (less detailed) decoding A approximate IPC Z prefer to ignore timestamps (so-called "timeless" decoding) + T use the timestamp trace as kernel time The default is all events i.e. the same as --itrace=iybxwpe, except for perf script where it is --itrace=ce diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index a0368202a746a..f528c4364d235 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -1638,6 +1638,9 @@ int itrace_do_parse_synth_opts(struct itrace_synth_opts *synth_opts, case 'Z': synth_opts->timeless_decoding = true; break; + case 'T': + synth_opts->use_timestamp = true; + break; case ' ': case ',': break; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 29eb82dff5749..55702215a82d3 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -99,6 +99,7 @@ enum itrace_period_type { * @remote_access: whether to synthesize remote access events * @mem: whether to synthesize memory events * @timeless_decoding: prefer "timeless" decoding i.e. ignore timestamps + * @use_timestamp: use the timestamp trace as kernel time * @vm_time_correlation: perform VM Time Correlation * @vm_tm_corr_dry_run: VM Time Correlation dry-run * @vm_tm_corr_args: VM Time Correlation implementation-specific arguments @@ -146,6 +147,7 @@ struct itrace_synth_opts { bool remote_access; bool mem; bool timeless_decoding; + bool use_timestamp; bool vm_time_correlation; bool vm_tm_corr_dry_run; char *vm_tm_corr_args; @@ -678,6 +680,7 @@ bool auxtrace__evsel_is_auxtrace(struct perf_session *session, " q: quicker (less detailed) decoding\n" \ " A: approximate IPC\n" \ " Z: prefer to ignore timestamps (so-called \"timeless\" decoding)\n" \ +" T: use the timestamp trace as kernel time\n" \ " PERIOD[ns|us|ms|i|t]: specify period to sample stream\n" \ " concatenate multiple options. Default is iybxwpe or cewp\n" -- GitLab From a4271827e609d30b7255aa7e4c453a8f3fe36a7b Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Sat, 14 Oct 2023 15:45:13 +0800 Subject: [PATCH 0124/1290] perf cs-etm: Enable itrace option 'T' Prior to Armv8.4, the feature FEAT_TRF is not supported by Arm CPUs. Consequently, the sysfs node 'ts_source' will not be set as 1 by the CoreSight ETM driver. On the other hand, the perf tool relies on the 'ts_source' node to determine whether the kernel timestamp is traced. Since the 'ts_source' is not set for Arm CPUs prior to Armv8.4, platforms in this case cannot utilize the traced timestamp as the kernel time. This patch enables the 'T' itrace option, which forcibly utilizes the traced timestamp as the kernel time. If users are aware that their working platform's Arm CoreSight shares the same counter with the kernel time, they can specify 'T' option to decode the traced timestamp as the kernel time. An usage example is: # perf record -e cs_etm// -- test_program # perf script --itrace=i10ibT # perf report --itrace=i10ibT Reviewed-by: James Clark Signed-off-by: Leo Yan Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20231014074513.1668000-3-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index a9873d14c6329..d65d7485886cd 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -3346,12 +3346,27 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event, etm->metadata = metadata; etm->auxtrace_type = auxtrace_info->type; - /* Use virtual timestamps if all ETMs report ts_source = 1 */ - etm->has_virtual_ts = cs_etm__has_virtual_ts(metadata, num_cpu); + if (etm->synth_opts.use_timestamp) + /* + * Prior to Armv8.4, Arm CPUs don't support FEAT_TRF feature, + * therefore the decoder cannot know if the timestamp trace is + * same with the kernel time. + * + * If a user has knowledge for the working platform and can + * specify itrace option 'T' to tell decoder to forcely use the + * traced timestamp as the kernel time. + */ + etm->has_virtual_ts = true; + else + /* Use virtual timestamps if all ETMs report ts_source = 1 */ + etm->has_virtual_ts = cs_etm__has_virtual_ts(metadata, num_cpu); if (!etm->has_virtual_ts) ui__warning("Virtual timestamps are not enabled, or not supported by the traced system.\n" - "The time field of the samples will not be set accurately.\n\n"); + "The time field of the samples will not be set accurately.\n" + "For Arm CPUs prior to Armv8.4 or without support FEAT_TRF,\n" + "you can specify the itrace option 'T' for timestamp decoding\n" + "if the Coresight timestamp on the platform is same with the kernel time.\n\n"); etm->auxtrace.process_event = cs_etm__process_event; etm->auxtrace.process_auxtrace_event = cs_etm__process_auxtrace_event; -- GitLab From a24d9d9dc096fc0d0bd85302c9a4fe4fe3b1107b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 22 Nov 2023 20:29:22 -0800 Subject: [PATCH 0125/1290] perf parse-events: Make legacy events lower priority than sysfs/JSON The perf tool has previously made legacy events the priority so with or without a PMU the legacy event would be opened: $ perf stat -e cpu-cycles,cpu/cpu-cycles/ true Using CPUID GenuineIntel-6-8D-1 intel_pt default config: tsc,mtc,mtc_period=3,psb_period=3,pt,branch Attempting to add event pmu 'cpu' with 'cpu-cycles,' that may result in non-fatal errors After aliases, add event pmu 'cpu' with 'cpu-cycles,' that may result in non-fatal errors Control descriptor is not initialized ------------------------------------------------------------ perf_event_attr: type 0 (PERF_TYPE_HARDWARE) size 136 config 0 (PERF_COUNT_HW_CPU_CYCLES) sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 enable_on_exec 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid 833967 cpu -1 group_fd -1 flags 0x8 = 3 ------------------------------------------------------------ perf_event_attr: type 0 (PERF_TYPE_HARDWARE) size 136 config 0 (PERF_COUNT_HW_CPU_CYCLES) sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 enable_on_exec 1 exclude_guest 1 ------------------------------------------------------------ ... Fixes to make hybrid/BIG.little PMUs behave correctly, ie as core PMUs capable of opening legacy events on each, removing hard coded "cpu_core" and "cpu_atom" Intel PMU names, etc. caused a behavioral difference on Apple/ARM due to latent issues in the PMU driver reported in: https://lore.kernel.org/lkml/08f1f185-e259-4014-9ca4-6411d5c1bc65@marcan.st/ As part of that report Mark Rutland requested that legacy events not be higher in priority when a PMU is specified reversing what has until this change been perf's default behavior. With this change the above becomes: $ perf stat -e cpu-cycles,cpu/cpu-cycles/ true Using CPUID GenuineIntel-6-8D-1 Attempt to add: cpu/cpu-cycles=0/ ..after resolving event: cpu/event=0x3c/ Control descriptor is not initialized ------------------------------------------------------------ perf_event_attr: type 0 (PERF_TYPE_HARDWARE) size 136 config 0 (PERF_COUNT_HW_CPU_CYCLES) sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 enable_on_exec 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid 827628 cpu -1 group_fd -1 flags 0x8 = 3 ------------------------------------------------------------ perf_event_attr: type 4 (PERF_TYPE_RAW) size 136 config 0x3c sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 enable_on_exec 1 exclude_guest 1 ------------------------------------------------------------ ... So the second event has become a raw event as /sys/devices/cpu/events/cpu-cycles exists. A fix was necessary to config_term_pmu in parse-events.c as check_alias expansion needs to happen after config_term_pmu, and config_term_pmu may need calling a second time because of this. config_term_pmu is updated to not use the legacy event when the PMU has such a named event (either from JSON or sysfs). The bulk of this change is updating all of the parse-events test expectations so that if a sysfs/JSON event exists for a PMU the test doesn't fail - a further sign, if it were needed, that the legacy event priority was a known and tested behavior of the perf tool. Reported-by: Hector Martin Signed-off-by: Ian Rogers Tested-by: Hector Martin Tested-by: Marc Zyngier Acked-by: Mark Rutland Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231123042922.834425-1-irogers@google.com [ Initialize the 'alias_rewrote_terms' variable to false to address a clang warning ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 256 +++++++++++++++++++++++--------- tools/perf/util/parse-events.c | 52 +++++-- tools/perf/util/pmu.c | 8 +- tools/perf/util/pmu.h | 3 +- 4 files changed, 231 insertions(+), 88 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index e52f45c7c3d1a..fbdf710d5eea0 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -162,6 +162,22 @@ static int test__checkevent_numeric(struct evlist *evlist) return TEST_OK; } + +static int assert_hw(struct perf_evsel *evsel, enum perf_hw_id id, const char *name) +{ + struct perf_pmu *pmu; + + if (evsel->attr.type == PERF_TYPE_HARDWARE) { + TEST_ASSERT_VAL("wrong config", test_perf_config(evsel, id)); + return 0; + } + pmu = perf_pmus__find_by_type(evsel->attr.type); + + TEST_ASSERT_VAL("unexpected PMU type", pmu); + TEST_ASSERT_VAL("PMU missing event", perf_pmu__have_event(pmu, name)); + return 0; +} + static int test__checkevent_symbolic_name(struct evlist *evlist) { struct perf_evsel *evsel; @@ -169,10 +185,12 @@ static int test__checkevent_symbolic_name(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 0 != evlist->core.nr_entries); perf_evlist__for_each_evsel(&evlist->core, evsel) { - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type); - TEST_ASSERT_VAL("wrong config", - test_perf_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + int ret = assert_hw(evsel, PERF_COUNT_HW_INSTRUCTIONS, "instructions"); + + if (ret) + return ret; } + return TEST_OK; } @@ -183,8 +201,10 @@ static int test__checkevent_symbolic_name_config(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of entries", 0 != evlist->core.nr_entries); perf_evlist__for_each_evsel(&evlist->core, evsel) { - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type); - TEST_ASSERT_VAL("wrong config", test_perf_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + int ret = assert_hw(evsel, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + + if (ret) + return ret; /* * The period value gets configured within evlist__config, * while this test executes only parse events method. @@ -861,10 +881,14 @@ static int test__group1(struct evlist *evlist) evlist__nr_groups(evlist) == num_core_entries()); for (int i = 0; i < num_core_entries(); i++) { + int ret; + /* instructions:k */ evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_INSTRUCTIONS, "instructions"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -878,8 +902,10 @@ static int test__group1(struct evlist *evlist) /* cycles:upp */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -907,6 +933,8 @@ static int test__group2(struct evlist *evlist) TEST_ASSERT_VAL("wrong number of groups", 1 == evlist__nr_groups(evlist)); evlist__for_each_entry(evlist, evsel) { + int ret; + if (evsel->core.attr.type == PERF_TYPE_SOFTWARE) { /* faults + :ku modifier */ leader = evsel; @@ -939,8 +967,10 @@ static int test__group2(struct evlist *evlist) continue; } /* cycles:k */ - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -957,6 +987,7 @@ static int test__group2(struct evlist *evlist) static int test__group3(struct evlist *evlist __maybe_unused) { struct evsel *evsel, *group1_leader = NULL, *group2_leader = NULL; + int ret; TEST_ASSERT_VAL("wrong number of entries", evlist->core.nr_entries == (3 * perf_pmus__num_core_pmus() + 2)); @@ -1045,8 +1076,10 @@ static int test__group3(struct evlist *evlist __maybe_unused) continue; } /* instructions:u */ - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_INSTRUCTIONS, "instructions"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1070,10 +1103,14 @@ static int test__group4(struct evlist *evlist __maybe_unused) num_core_entries() == evlist__nr_groups(evlist)); for (int i = 0; i < num_core_entries(); i++) { + int ret; + /* cycles:u + p */ evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1089,8 +1126,10 @@ static int test__group4(struct evlist *evlist __maybe_unused) /* instructions:kp + p */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_INSTRUCTIONS, "instructions"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1108,6 +1147,7 @@ static int test__group4(struct evlist *evlist __maybe_unused) static int test__group5(struct evlist *evlist __maybe_unused) { struct evsel *evsel = NULL, *leader; + int ret; TEST_ASSERT_VAL("wrong number of entries", evlist->core.nr_entries == (5 * num_core_entries())); @@ -1117,8 +1157,10 @@ static int test__group5(struct evlist *evlist __maybe_unused) for (int i = 0; i < num_core_entries(); i++) { /* cycles + G */ evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1133,8 +1175,10 @@ static int test__group5(struct evlist *evlist __maybe_unused) /* instructions + G */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_INSTRUCTIONS, "instructions"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1148,8 +1192,10 @@ static int test__group5(struct evlist *evlist __maybe_unused) for (int i = 0; i < num_core_entries(); i++) { /* cycles:G */ evsel = leader = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1164,8 +1210,10 @@ static int test__group5(struct evlist *evlist __maybe_unused) /* instructions:G */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_INSTRUCTIONS, "instructions"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1178,8 +1226,10 @@ static int test__group5(struct evlist *evlist __maybe_unused) for (int i = 0; i < num_core_entries(); i++) { /* cycles */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1201,10 +1251,14 @@ static int test__group_gh1(struct evlist *evlist) evlist__nr_groups(evlist) == num_core_entries()); for (int i = 0; i < num_core_entries(); i++) { + int ret; + /* cycles + :H group modifier */ evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1218,8 +1272,10 @@ static int test__group_gh1(struct evlist *evlist) /* cache-misses:G + :H group modifier */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CACHE_MISSES, "cache-misses"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1242,10 +1298,14 @@ static int test__group_gh2(struct evlist *evlist) evlist__nr_groups(evlist) == num_core_entries()); for (int i = 0; i < num_core_entries(); i++) { + int ret; + /* cycles + :G group modifier */ evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1259,8 +1319,10 @@ static int test__group_gh2(struct evlist *evlist) /* cache-misses:H + :G group modifier */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CACHE_MISSES, "cache-misses"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1283,10 +1345,14 @@ static int test__group_gh3(struct evlist *evlist) evlist__nr_groups(evlist) == num_core_entries()); for (int i = 0; i < num_core_entries(); i++) { + int ret; + /* cycles:G + :u group modifier */ evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1300,8 +1366,10 @@ static int test__group_gh3(struct evlist *evlist) /* cache-misses:H + :u group modifier */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CACHE_MISSES, "cache-misses"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1324,10 +1392,14 @@ static int test__group_gh4(struct evlist *evlist) evlist__nr_groups(evlist) == num_core_entries()); for (int i = 0; i < num_core_entries(); i++) { + int ret; + /* cycles:G + :uG group modifier */ evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1341,8 +1413,10 @@ static int test__group_gh4(struct evlist *evlist) /* cache-misses:H + :uG group modifier */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CACHE_MISSES, "cache-misses"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1363,10 +1437,14 @@ static int test__leader_sample1(struct evlist *evlist) evlist->core.nr_entries == (3 * num_core_entries())); for (int i = 0; i < num_core_entries(); i++) { + int ret; + /* cycles - sampling group leader */ evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1379,8 +1457,10 @@ static int test__leader_sample1(struct evlist *evlist) /* cache-misses - not sampling */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CACHE_MISSES, "cache-misses"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1392,8 +1472,10 @@ static int test__leader_sample1(struct evlist *evlist) /* branch-misses - not sampling */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", !evsel->core.attr.exclude_hv); @@ -1415,10 +1497,14 @@ static int test__leader_sample2(struct evlist *evlist __maybe_unused) evlist->core.nr_entries == (2 * num_core_entries())); for (int i = 0; i < num_core_entries(); i++) { + int ret; + /* instructions - sampling group leader */ evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_INSTRUCTIONS, "instructions"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1431,8 +1517,10 @@ static int test__leader_sample2(struct evlist *evlist __maybe_unused) /* branch-misses - not sampling */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); TEST_ASSERT_VAL("wrong exclude_hv", evsel->core.attr.exclude_hv); @@ -1472,10 +1560,14 @@ static int test__pinned_group(struct evlist *evlist) evlist->core.nr_entries == (3 * num_core_entries())); for (int i = 0; i < num_core_entries(); i++) { + int ret; + /* cycles - group leader */ evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); /* TODO: The group modifier is not copied to the split group leader. */ @@ -1484,13 +1576,18 @@ static int test__pinned_group(struct evlist *evlist) /* cache-misses - can not be pinned, but will go on with the leader */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CACHE_MISSES, "cache-misses"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); /* branch-misses - ditto */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong pinned", !evsel->core.attr.pinned); } return TEST_OK; @@ -1517,10 +1614,14 @@ static int test__exclusive_group(struct evlist *evlist) evlist->core.nr_entries == 3 * num_core_entries()); for (int i = 0; i < num_core_entries(); i++) { + int ret; + /* cycles - group leader */ evsel = leader = (i == 0 ? evlist__first(evlist) : evsel__next(evsel)); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong group name", !evsel->group_name); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); /* TODO: The group modifier is not copied to the split group leader. */ @@ -1529,13 +1630,18 @@ static int test__exclusive_group(struct evlist *evlist) /* cache-misses - can not be pinned, but will go on with the leader */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CACHE_MISSES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CACHE_MISSES, "cache-misses"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclusive", !evsel->core.attr.exclusive); /* branch-misses - ditto */ evsel = evsel__next(evsel); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_MISSES)); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses"); + if (ret) + return ret; + TEST_ASSERT_VAL("wrong exclusive", !evsel->core.attr.exclusive); } return TEST_OK; @@ -1677,9 +1783,11 @@ static int test__checkevent_raw_pmu(struct evlist *evlist) static int test__sym_event_slash(struct evlist *evlist) { struct evsel *evsel = evlist__first(evlist); + int ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + + if (ret) + return ret; - TEST_ASSERT_VAL("wrong type", evsel->core.attr.type == PERF_TYPE_HARDWARE); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); return TEST_OK; } @@ -1687,9 +1795,11 @@ static int test__sym_event_slash(struct evlist *evlist) static int test__sym_event_dc(struct evlist *evlist) { struct evsel *evsel = evlist__first(evlist); + int ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + + if (ret) + return ret; - TEST_ASSERT_VAL("wrong type", evsel->core.attr.type == PERF_TYPE_HARDWARE); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); return TEST_OK; } @@ -1697,9 +1807,11 @@ static int test__sym_event_dc(struct evlist *evlist) static int test__term_equal_term(struct evlist *evlist) { struct evsel *evsel = evlist__first(evlist); + int ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + + if (ret) + return ret; - TEST_ASSERT_VAL("wrong type", evsel->core.attr.type == PERF_TYPE_HARDWARE); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "name") == 0); return TEST_OK; } @@ -1707,9 +1819,11 @@ static int test__term_equal_term(struct evlist *evlist) static int test__term_equal_legacy(struct evlist *evlist) { struct evsel *evsel = evlist__first(evlist); + int ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + + if (ret) + return ret; - TEST_ASSERT_VAL("wrong type", evsel->core.attr.type == PERF_TYPE_HARDWARE); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "l1d") == 0); return TEST_OK; } diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index aa2f5c6fc7fc2..66eabcea42427 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -976,7 +976,7 @@ static int config_term_pmu(struct perf_event_attr *attr, struct parse_events_error *err) { if (term->type_term == PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE) { - const struct perf_pmu *pmu = perf_pmus__find_by_type(attr->type); + struct perf_pmu *pmu = perf_pmus__find_by_type(attr->type); if (!pmu) { char *err_str; @@ -986,15 +986,23 @@ static int config_term_pmu(struct perf_event_attr *attr, err_str, /*help=*/NULL); return -EINVAL; } - if (perf_pmu__supports_legacy_cache(pmu)) { + /* + * Rewrite the PMU event to a legacy cache one unless the PMU + * doesn't support legacy cache events or the event is present + * within the PMU. + */ + if (perf_pmu__supports_legacy_cache(pmu) && + !perf_pmu__have_event(pmu, term->config)) { attr->type = PERF_TYPE_HW_CACHE; return parse_events__decode_legacy_cache(term->config, pmu->type, &attr->config); - } else + } else { term->type_term = PARSE_EVENTS__TERM_TYPE_USER; + term->no_value = true; + } } if (term->type_term == PARSE_EVENTS__TERM_TYPE_HARDWARE) { - const struct perf_pmu *pmu = perf_pmus__find_by_type(attr->type); + struct perf_pmu *pmu = perf_pmus__find_by_type(attr->type); if (!pmu) { char *err_str; @@ -1004,10 +1012,19 @@ static int config_term_pmu(struct perf_event_attr *attr, err_str, /*help=*/NULL); return -EINVAL; } - attr->type = PERF_TYPE_HARDWARE; - attr->config = term->val.num; - if (perf_pmus__supports_extended_type()) - attr->config |= (__u64)pmu->type << PERF_PMU_TYPE_SHIFT; + /* + * If the PMU has a sysfs or json event prefer it over + * legacy. ARM requires this. + */ + if (perf_pmu__have_event(pmu, term->config)) { + term->type_term = PARSE_EVENTS__TERM_TYPE_USER; + term->no_value = true; + } else { + attr->type = PERF_TYPE_HARDWARE; + attr->config = term->val.num; + if (perf_pmus__supports_extended_type()) + attr->config |= (__u64)pmu->type << PERF_PMU_TYPE_SHIFT; + } return 0; } if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER || @@ -1381,6 +1398,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, YYLTYPE *loc = loc_; LIST_HEAD(config_terms); struct parse_events_terms parsed_terms; + bool alias_rewrote_terms = false; pmu = parse_state->fake_pmu ?: perf_pmus__find(name); @@ -1433,7 +1451,15 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, return evsel ? 0 : -ENOMEM; } - if (!parse_state->fake_pmu && perf_pmu__check_alias(pmu, &parsed_terms, &info, err)) { + /* Configure attr/terms with a known PMU, this will set hardcoded terms. */ + if (config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) { + parse_events_terms__exit(&parsed_terms); + return -EINVAL; + } + + /* Look for event names in the terms and rewrite into format based terms. */ + if (!parse_state->fake_pmu && perf_pmu__check_alias(pmu, &parsed_terms, + &info, &alias_rewrote_terms, err)) { parse_events_terms__exit(&parsed_terms); return -EINVAL; } @@ -1447,11 +1473,9 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, strbuf_release(&sb); } - /* - * Configure hardcoded terms first, no need to check - * return value when called with fail == 0 ;) - */ - if (config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) { + /* Configure attr/terms again if an alias was expanded. */ + if (alias_rewrote_terms && + config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) { parse_events_terms__exit(&parsed_terms); return -EINVAL; } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index d3c9aa4326bee..3c9609944a2f3 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1494,12 +1494,14 @@ static int check_info_data(struct perf_pmu *pmu, * defined for the alias */ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_terms, - struct perf_pmu_info *info, struct parse_events_error *err) + struct perf_pmu_info *info, bool *rewrote_terms, + struct parse_events_error *err) { struct parse_events_term *term, *h; struct perf_pmu_alias *alias; int ret; + *rewrote_terms = false; info->per_pkg = false; /* @@ -1521,7 +1523,7 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_ NULL); return ret; } - + *rewrote_terms = true; ret = check_info_data(pmu, alias, info, err, term->err_term); if (ret) return ret; @@ -1615,6 +1617,8 @@ bool perf_pmu__auto_merge_stats(const struct perf_pmu *pmu) bool perf_pmu__have_event(struct perf_pmu *pmu, const char *name) { + if (!name) + return false; if (perf_pmu__find_alias(pmu, name, /*load=*/ true) != NULL) return true; if (pmu->cpu_aliases_added || !pmu->events_table) diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index d2895d415f08f..424c3fee09496 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -201,7 +201,8 @@ int perf_pmu__config_terms(const struct perf_pmu *pmu, __u64 perf_pmu__format_bits(struct perf_pmu *pmu, const char *name); int perf_pmu__format_type(struct perf_pmu *pmu, const char *name); int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_terms, - struct perf_pmu_info *info, struct parse_events_error *err); + struct perf_pmu_info *info, bool *rewrote_terms, + struct parse_events_error *err); int perf_pmu__find_event(struct perf_pmu *pmu, const char *event, void *state, pmu_event_callback cb); int perf_pmu__format_parse(struct perf_pmu *pmu, int dirfd, bool eager_load); -- GitLab From 4a18ab467820b75436ea9ddd42ee7cb10efa491c Mon Sep 17 00:00:00 2001 From: zhaimingbing Date: Fri, 24 Nov 2023 17:26:57 +0800 Subject: [PATCH 0126/1290] perf lock: Fix a memory leak on an error path if a strdup-ed string is NULL,the allocated memory needs freeing. Signed-off-by: zhaimingbing Acked-by: Ingo Molnar Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Li Dong Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sean Christopherson Link: https://lore.kernel.org/r/20231124092657.10392-1-zhaimingbing@cmss.chinamobile.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-lock.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index a3ff2f4edbaa5..230461280e452 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -2285,8 +2285,10 @@ setup_args: else ev_name = strdup(contention_tracepoints[j].name); - if (!ev_name) + if (!ev_name) { + free(rec_argv); return -ENOMEM; + } rec_argv[i++] = "-e"; rec_argv[i++] = ev_name; -- GitLab From 581ff5b66c94a8133d1c77ed42334623fadc968c Mon Sep 17 00:00:00 2001 From: zhujun2 Date: Tue, 14 Nov 2023 22:42:55 -0800 Subject: [PATCH 0127/1290] perf tests coresight: Remove unused variables These variables are never referenced in the code, just remove them. Reviewed-by: James Clark Signed-off-by: zhujun2 Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: coresight@lists.linaro.org Link: https://lore.kernel.org/r/20231115064255.11057-1-zhujun2@cmss.chinamobile.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c | 1 - tools/perf/tests/shell/coresight/thread_loop/thread_loop.c | 1 - .../shell/coresight/unroll_loop_thread/unroll_loop_thread.c | 1 - 3 files changed, 3 deletions(-) diff --git a/tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c b/tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c index a7e169d1bf645..5f886cd09e6b3 100644 --- a/tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c +++ b/tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c @@ -42,7 +42,6 @@ static pthread_t new_thr(void *(*fn) (void *arg), void *arg) int main(int argc, char **argv) { unsigned long i, len, size, thr; - pthread_t threads[256]; struct args args[256]; long long v; diff --git a/tools/perf/tests/shell/coresight/thread_loop/thread_loop.c b/tools/perf/tests/shell/coresight/thread_loop/thread_loop.c index c0158fac7d0b0..e05a559253ca9 100644 --- a/tools/perf/tests/shell/coresight/thread_loop/thread_loop.c +++ b/tools/perf/tests/shell/coresight/thread_loop/thread_loop.c @@ -57,7 +57,6 @@ static pthread_t new_thr(void *(*fn) (void *arg), void *arg) int main(int argc, char **argv) { unsigned int i, len, thr; - pthread_t threads[256]; struct args args[256]; if (argc < 3) { diff --git a/tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c b/tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c index 8f6d384208ed9..0fc7bf1a25af3 100644 --- a/tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c +++ b/tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c @@ -51,7 +51,6 @@ static pthread_t new_thr(void *(*fn) (void *arg), void *arg) int main(int argc, char **argv) { unsigned int i, thr; - pthread_t threads[256]; struct args args[256]; if (argc < 2) { -- GitLab From 5ebe2f4bf0a8fe8cceb5664a7dea4c17e2cf8477 Mon Sep 17 00:00:00 2001 From: Ji Sheng Teoh Date: Wed, 22 Nov 2023 11:09:08 +0800 Subject: [PATCH 0128/1290] perf vendor events riscv: Add StarFive Dubhe-90 JSON file Similar to StarFive's Dubhe-80, Dubhe-90 supports raw event id 0x00 - 0x22. Reuse Dubhe-80 firmware and common json file. The raw events are enabled through PMU node of DT binding. Besides raw event, add standard RISC-V firmware events to support monitoring of firmware event. Example of PMU DT node: pmu { compatible = "riscv,pmu"; riscv,raw-event-to-mhpmcounters = /* Event ID 1-31 */ <0x00 0x00 0xFFFFFFFF 0xFFFFFFE0 0x00007FF8>, /* Event ID 32-33 */ <0x00 0x20 0xFFFFFFFF 0xFFFFFFFE 0x00007FF8>, /* Event ID 34 */ <0x00 0x22 0xFFFFFFFF 0xFFFFFF22 0x00007FF8>; }; 'perf stat' output: [root@user]# perf stat -a \ -e access_mmu_stlb \ -e miss_mmu_stlb \ -e access_mmu_pte_c \ -e rob_flush \ -e btb_prediction_miss \ -e itlb_miss \ -e sync_del_fetch_g \ -e icache_miss \ -e bpu_br_retire \ -e bpu_br_miss \ -e ret_ins_retire \ -e ret_ins_miss \ -- openssl speed rsa2048 Doing 2048 bits private rsa's for 10s: 39 2048 bits private RSA's in 10.03s Doing 2048 bits public rsa's for 10s: 1469 2048 bits public RSA's in 9.47s version: 3.0.10 built on: Tue Aug 1 13:47:24 2023 UTC options: bn(64,64) CPUINFO: N/A sign verify sign/s verify/s rsa 2048 bits 0.257179s 0.006447s 3.9 155.1 Performance counter stats for 'system wide': 3112882 access_mmu_stlb 10550 miss_mmu_stlb 18251 access_mmu_pte_c 274765 rob_flush 22470560 btb_prediction_miss 3035839 itlb_miss 643549060 sync_del_fetch_g 133013 icache_miss 62982796 bpu_br_retire 287548 bpu_br_miss 8935910 ret_ins_retire 8308 ret_ins_miss 20.656182600 seconds time elapsed Reviewed-by: Ley Foon Tan Signed-off-by: Ji Sheng Teoh Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Nikita Shubin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: linux-riscv@lists.infradead.org Link: https://lore.kernel.org/r/20231122030908.2981502-1-jisheng.teoh@starfivetech.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/riscv/mapfile.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/pmu-events/arch/riscv/mapfile.csv b/tools/perf/pmu-events/arch/riscv/mapfile.csv index ee61e26f90cd3..56b03138d46aa 100644 --- a/tools/perf/pmu-events/arch/riscv/mapfile.csv +++ b/tools/perf/pmu-events/arch/riscv/mapfile.csv @@ -15,4 +15,4 @@ # #MVENDORID-MARCHID-MIMPID,Version,Filename,EventType 0x489-0x8000000000000007-0x[[:xdigit:]]+,v1,sifive/u74,core -0x67e-0x80000000db000080-0x[[:xdigit:]]+,v1,starfive/dubhe-80,core +0x67e-0x80000000db0000[89]0-0x[[:xdigit:]]+,v1,starfive/dubhe-80,core -- GitLab From 1638b11ef8156c8551f5aaa5799069633593c5fe Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 23 Nov 2023 21:32:32 +0530 Subject: [PATCH 0129/1290] perf tools: Add perf binary dependent rule for shellcheck log in Makefile.perf Add rule in new Makefile "tests/Makefile.tests" for running shellcheck on shell test scripts. This automates below shellcheck into the build. $ for F in $(find tests/shell/ -perm -o=x -name '*.sh'); do shellcheck -S warning $F; done Condition for shellcheck is added in Makefile.perf to avoid build breakage in the absence of shellcheck binary. Update Makefile.perf to contain new rule for "SHELLCHECK_TEST" which is for making shellcheck test as a dependency on perf binary. Added "tests/Makefile.tests" to run shellcheck on shellscripts in tests/shell. The make rule "SHLLCHECK_RUN" ensures that, every time during make, shellcheck will be run only on modified files during subsequent invocations. By this, if any newly added shell scripts or fixes in existing scripts breaks coding/formatting style, it will get captured during the perf build. Example build failure by modifying probe_vfs_getname.sh in tests/shell: In tests/shell/probe_vfs_getname.sh line 8: . $(dirname $0)/lib/probe.sh ^-----------^ SC2046 (warning): Quote this to prevent word splitting. For more information: https://www.shellcheck.net/wiki/SC2046 -- Quote this to prevent word splitt... make[3]: *** [/root/athira/perf-tools-next/tools/perf/tests/Makefile.tests:18: tests/shell/.probe_vfs_getname.sh.shellcheck_log] Error 1 make[2]: *** [Makefile.perf:686: SHELLCHECK_TEST] Error 2 make[2]: *** Waiting for unfinished jobs.... make[1]: *** [Makefile.perf:244: sub-make] Error 2 make: *** [Makefile:70: all] Error 2 Here, like other files which gets created during compilation (ex: .builtin-bench.o.cmd or .perf.o.cmd ), create .shellcheck_log also as a hidden file. Example: tests/shell/.probe_vfs_getname.sh.shellcheck_log shellcheck is re-run if any of the script gets modified based on its dependency of this log file. After this, for testing, changed "tests/shell/trace+probe_vfs_getname.sh" to break shellcheck format. In the next make run, it is also captured: In tests/shell/probe_vfs_getname.sh line 8: . $(dirname $0)/lib/probe.sh ^-----------^ SC2046 (warning): Quote this to prevent word splitting. For more information: https://www.shellcheck.net/wiki/SC2046 -- Quote this to prevent word splitt... make[3]: *** [/root/athira/perf-tools-next/tools/perf/tests/Makefile.tests:18: tests/shell/.probe_vfs_getname.sh.shellcheck_log] Error 1 make[3]: *** Waiting for unfinished jobs.... In tests/shell/trace+probe_vfs_getname.sh line 14: . $(dirname $0)/lib/probe.sh ^-----------^ SC2046 (warning): Quote this to prevent word splitting. For more information: https://www.shellcheck.net/wiki/SC2046 -- Quote this to prevent word splitt... make[3]: *** [/root/athira/perf-tools-next/tools/perf/tests/Makefile.tests:18: tests/shell/.trace+probe_vfs_getname.sh.shellcheck_log] Error 1 make[2]: *** [Makefile.perf:686: SHELLCHECK_TEST] Error 2 make[2]: *** Waiting for unfinished jobs.... make[1]: *** [Makefile.perf:244: sub-make] Error 2 make: *** [Makefile:70: all] Error 2 Failure log can be found in the stdout of make itself. This is reported at build time. To be able to go ahead with the build or disable shellcheck even though it is known that some test is broken, add a "NO_SHELLCHECK" option. Example: make NO_SHELLCHECK=1 INSTALL libsubcmd_headers INSTALL libsymbol_headers INSTALL libapi_headers INSTALL libperf_headers INSTALL libbpf_headers LINK perf Note: This is tested on RHEL and also SLES. Use below check: "$(shell which shellcheck 2> /dev/null)" to look for presence of shellcheck binary. The approach "shell command -v" is not used here. In some of the distros(RHEL), command is available as executable file (/usr/bin/command). But in some distros(SLES), it is a shell builtin and not available as executable file. Committer testing: $ type shellcheck shellcheck is hashed (/usr/bin/shellcheck) $ rpm -qf /usr/bin/shellcheck ShellCheck-0.9.0-2.fc38.x86_64 $ $ alias m $ git diff diff --git a/tools/perf/tests/shell/probe_vfs_getname.sh b/tools/perf/tests/shell/probe_vfs_getname.sh index 554e12e83c55fd56..dbc14634678e2bf6 100755 --- a/tools/perf/tests/shell/probe_vfs_getname.sh +++ b/tools/perf/tests/shell/probe_vfs_getname.sh @@ -5,7 +5,7 @@ # Arnaldo Carvalho de Melo , 2017 # shellcheck source=lib/probe.sh -. "$(dirname $0)"/lib/probe.sh +. $(dirname $0)/lib/probe.sh skip_if_no_perf_probe || exit 2 alias m='rm -rf ~/libexec/perf-core/ ; make -k CORESIGHT=1 O=/tmp/build/$(basename $PWD) -C tools/perf install-bin && perf test python' $ m make: Entering directory '/home/acme/git/perf-tools-next/tools/perf' BUILD: Doing 'make -j32' parallel build INSTALL libbpf_headers In tests/shell/probe_vfs_getname.sh line 8: . $(dirname $0)/lib/probe.sh ^-----------^ SC2046 (warning): Quote this to prevent word splitting. For more information: https://www.shellcheck.net/wiki/SC2046 -- Quote this to prevent word splitt... make[3]: *** [/home/acme/git/perf-tools-next/tools/perf/tests/Makefile.tests:18: tests/shell/.probe_vfs_getname.sh.shellcheck_log] Error 1 make[2]: *** [Makefile.perf:686: SHELLCHECK_TEST] Error 2 make[2]: *** Waiting for unfinished jobs.... make[1]: *** [Makefile.perf:244: sub-make] Error 2 make: *** [Makefile:113: install-bin] Error 2 make: Leaving directory '/home/acme/git/perf-tools-next/tools/perf' $ Reviewed-by: James Clark Signed-off-by: Athira Jajeev Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Disha Goel Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20231123160232.94253-1-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 21 ++++++++++++++++++++- tools/perf/tests/Makefile.tests | 22 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 tools/perf/tests/Makefile.tests diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index d80dcaa5a1e33..824cbc0af7d7a 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -134,6 +134,8 @@ include ../scripts/utilities.mak # x86 instruction decoder - new instructions test # # Define GEN_VMLINUX_H to generate vmlinux.h from the BTF. +# +# Define NO_SHELLCHECK if you do not want to run shellcheck during build # As per kernel Makefile, avoid funny character set dependencies unexport LC_ALL @@ -671,7 +673,23 @@ $(PERF_IN): prepare FORCE $(PMU_EVENTS_IN): FORCE prepare $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=pmu-events obj=pmu-events -$(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(PMU_EVENTS_IN) +# Runs shellcheck on perf test shell scripts + +SHELLCHECK := $(shell which shellcheck 2> /dev/null) + +ifeq ($(NO_SHELLCHECK),1) +SHELLCHECK := +endif + +ifneq ($(SHELLCHECK),) +SHELLCHECK_TEST: FORCE prepare + $(Q)$(MAKE) -f $(srctree)/tools/perf/tests/Makefile.tests +else +SHELLCHECK_TEST: + @: +endif + +$(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(PMU_EVENTS_IN) SHELLCHECK_TEST $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) \ $(PERF_IN) $(PMU_EVENTS_IN) $(LIBS) -o $@ @@ -1134,6 +1152,7 @@ bpf-skel-clean: $(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS) clean:: $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBSYMBOL)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean tests-coresight-targets-clean + $(Q)$(MAKE) -f $(srctree)/tools/perf/tests/Makefile.tests clean $(call QUIET_CLEAN, core-objs) $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-iostat $(LANG_BINDINGS) $(Q)find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected diff --git a/tools/perf/tests/Makefile.tests b/tools/perf/tests/Makefile.tests new file mode 100644 index 0000000000000..fdaca5f7a946d --- /dev/null +++ b/tools/perf/tests/Makefile.tests @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0 +# Athira Rajeev , 2023 + +PROGS := $(shell find tests/shell -perm -o=x -type f -name '*.sh') +FILE_NAME := $(notdir $(PROGS)) +FILE_NAME := $(FILE_NAME:%=.%) +LOGS := $(join $(dir $(PROGS)),$(FILE_NAME)) +LOGS := $(LOGS:%=%.shellcheck_log) + +.PHONY: all +all: SHELLCHECK_RUN + @: + +SHELLCHECK_RUN: $(LOGS) + +.%.shellcheck_log: % + $(call rule_mkdir) + $(Q)$(call frecho-cmd,test)@shellcheck -S warning "$<" > $@ || (cat $@ && rm $@ && false) + +clean: + $(eval log_files := $(shell find . -name '.*.shellcheck_log')) + @rm -rf $(log_files) -- GitLab From 8aa1e6e29a21f6bb99dcaa64d11e97a21f0f9dc1 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 22 Nov 2023 10:27:03 +0100 Subject: [PATCH 0130/1290] perf report: Remove warning on missing raw data for s390 Command # ./perf report -i /tmp/111 -D > /dev/null emits an error message when a sample for event CRYPTO_ALL in the perf.data file does not contain any raw data. This is ok. Do not trigger this warning when the sample in the perf.data files does not contain any raw data at all. Check for availability of raw data for all events and return if none is available. Output before: # ./perf report -i /tmp/111 -D > /dev/null Invalid CRYPTO_ALL raw data encountered Invalid CRYPTO_ALL raw data encountered Invalid CRYPTO_ALL raw data encountered # Output after: # ./perf report -i /tmp/111 -D > /dev/null # Fixes: b539deafbadb2fc6 ("perf report: Add s390 raw data interpretation for PAI counters") Signed-off-by: Thomas Richter Acked-by: Namhyung Kim Acked-by: Sumanth Korikkar Cc: Heiko Carstens Cc: Sven Schnelle Cc: Thomas Richter Cc: Vasily Gorbik Link: https://lore.kernel.org/r/20231122092703.3163191-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/s390-sample-raw.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/s390-sample-raw.c b/tools/perf/util/s390-sample-raw.c index 29a744eeb71eb..53383e97ec9d5 100644 --- a/tools/perf/util/s390-sample-raw.c +++ b/tools/perf/util/s390-sample-raw.c @@ -51,8 +51,6 @@ static bool s390_cpumcfdg_testctr(struct perf_sample *sample) struct cf_trailer_entry *te; struct cf_ctrset_entry *cep, ce; - if (!len) - return false; while (offset < len) { cep = (struct cf_ctrset_entry *)(buf + offset); ce.def = be16_to_cpu(cep->def); @@ -234,10 +232,9 @@ struct pai_data { /* Event number and value */ */ static bool s390_pai_all_test(struct perf_sample *sample) { - unsigned char *buf = sample->raw_data; size_t len = sample->raw_size; - if (len < 0xa || !buf) + if (len < 0xa) return false; return true; } @@ -299,6 +296,10 @@ void evlist__s390_sample_raw(struct evlist *evlist, union perf_event *event, if (!evsel) return; + /* Check for raw data in sample */ + if (!sample->raw_size || !sample->raw_data) + return; + /* Display raw data on screen */ if (evsel->core.attr.config == PERF_EVENT_CPUM_CF_DIAG) { if (!evsel->pmu) -- GitLab From 70df07838fc1c0acfab3325ae79014e241a88bdf Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 23 Nov 2023 09:58:41 +0200 Subject: [PATCH 0131/1290] perf header: Fix segfault on build_mem_topology() error path Do not increase the node count unless a node has been successfully read, because it can lead to a segfault if an error occurs. For example, if perf exceeds the open file limit in memory_node__read(), which, on a test system, could be made to happen by setting the file limit to exactly 32: Before: $ ulimit -n 32 $ perf mem record --all-user -- sleep 1 [ perf record: Woken up 1 times to write data ] failed: can't open memory sysfs data perf: Segmentation fault Obtained 14 stack frames. perf(sighandler_dump_stack+0x48) [0x55f4b1f59558] /lib/x86_64-linux-gnu/libc.so.6(+0x42520) [0x7f4ba1c42520] /lib/x86_64-linux-gnu/libc.so.6(free+0x1e) [0x7f4ba1ca53fe] perf(+0x178ff4) [0x55f4b1f48ff4] perf(+0x179a70) [0x55f4b1f49a70] perf(+0x17ef5d) [0x55f4b1f4ef5d] perf(+0x85c0b) [0x55f4b1e55c0b] perf(cmd_record+0xe1d) [0x55f4b1e5920d] perf(cmd_mem+0xc96) [0x55f4b1e80e56] perf(+0x130460) [0x55f4b1f00460] perf(main+0x689) [0x55f4b1e427d9] /lib/x86_64-linux-gnu/libc.so.6(+0x29d90) [0x7f4ba1c29d90] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x80) [0x7f4ba1c29e40] perf(_start+0x25) [0x55f4b1e42a25] Segmentation fault (core dumped) $ After: $ ulimit -n 32 $ perf mem record --all-user -- sleep 1 [ perf record: Woken up 1 times to write data ] failed: can't open memory sysfs data [ perf record: Captured and wrote 0.005 MB perf.data (11 samples) ] $ Fixes: f8e502b9d1b3b197 ("perf header: Ensure bitmaps are freed") Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: German Gomez Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20231123075848.9652-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 1c687b5789c0f..08cc2febabde2 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1444,7 +1444,9 @@ static int build_mem_topology(struct memory_node **nodesp, u64 *cntp) nodes = new_nodes; size += 4; } - ret = memory_node__read(&nodes[cnt++], idx); + ret = memory_node__read(&nodes[cnt], idx); + if (!ret) + cnt += 1; } out: closedir(dir); -- GitLab From 96ba5999e8d86138cea90422f8c00309a7eedd3b Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 23 Nov 2023 09:58:42 +0200 Subject: [PATCH 0132/1290] perf tests lib: Add perf_has_symbol.sh Some shell tests depend on finding symbols for perf itself, and fail if perf has been stripped and no debug object is available. Add helper functions to check if perf has a needed symbol. This is preparation for amending the tests themselves to be skipped if a needed symbol is not found. The functions make use of the "Symbols" test which reads and checks symbols from a dso, perf itself by default. Note the "Symbols" test will find symbols using the same method as other perf tests, including, for example, looking in the buildid cache. An alternative would be to prevent the needed symbols from being stripped, which seems to work with gcc's externally_visible attribute, but that attribute is not supported by clang. Another alternative would be to use option -Wl,-E (which is already used when perf is built with perl support) which causes the linker to add all (global) symbols to the dynamic symbol table. Then the required symbols need only be made global in scope to avoid being strippable. However that goes beyond what is needed. Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: German Gomez Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20231123075848.9652-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/perf_has_symbol.sh | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 tools/perf/tests/shell/lib/perf_has_symbol.sh diff --git a/tools/perf/tests/shell/lib/perf_has_symbol.sh b/tools/perf/tests/shell/lib/perf_has_symbol.sh new file mode 100644 index 0000000000000..5d59c32ae3e7b --- /dev/null +++ b/tools/perf/tests/shell/lib/perf_has_symbol.sh @@ -0,0 +1,21 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +perf_has_symbol() +{ + if perf test -vv "Symbols" 2>&1 | grep "[[:space:]]$1$"; then + echo "perf does have symbol '$1'" + return 0 + fi + echo "perf does not have symbol '$1'" + return 1 +} + +skip_test_missing_symbol() +{ + if ! perf_has_symbol "$1" ; then + echo "perf is missing symbols - skipping test" + exit 2 + fi + return 0 +} -- GitLab From c9526a735082bba57da322332cbcef1bbdff5698 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 23 Nov 2023 09:58:43 +0200 Subject: [PATCH 0133/1290] perf tests: Skip pipe test if noploop symbol is missing perf pipe recording and injection test depends on finding symbol noploop in perf, and fails if perf has been stripped and no debug object is available. In that case, skip the test instead. Example: Before: $ strip tools/perf/perf $ tools/perf/perf buildid-cache -p `realpath tools/perf/perf` $ tools/perf/perf test -v pipe 86: perf pipe recording and injection test : --- start --- test child forked, pid 47734 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.000 MB - ] 47741 47741 -1 |perf [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.000 MB - ] cannot find noploop function in pipe #1 test child finished with -1 ---- end ---- perf pipe recording and injection test: FAILED! After: $ tools/perf/perf test -v pipe 86: perf pipe recording and injection test : --- start --- test child forked, pid 48996 perf does not have symbol 'noploop' perf is missing symbols - skipping test test child finished with -2 ---- end ---- perf pipe recording and injection test: Skip Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: German Gomez Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20231123075848.9652-4-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/pipe_test.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/pipe_test.sh b/tools/perf/tests/shell/pipe_test.sh index 8dd115dd35a7e..a78d35d2cff07 100755 --- a/tools/perf/tests/shell/pipe_test.sh +++ b/tools/perf/tests/shell/pipe_test.sh @@ -2,10 +2,17 @@ # perf pipe recording and injection test # SPDX-License-Identifier: GPL-2.0 +shelldir=$(dirname "$0") +# shellcheck source=lib/perf_has_symbol.sh +. "${shelldir}"/lib/perf_has_symbol.sh + +sym="noploop" + +skip_test_missing_symbol ${sym} + data=$(mktemp /tmp/perf.data.XXXXXX) prog="perf test -w noploop" task="perf" -sym="noploop" if ! perf record -e task-clock:u -o - ${prog} | perf report -i - --task | grep ${task}; then echo "cannot find the test file in the perf report" -- GitLab From 3c489dbe69c155c86c4460491d11520cf8ec3637 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 23 Nov 2023 09:58:44 +0200 Subject: [PATCH 0134/1290] perf tests: Skip record test if test_loop symbol is missing perf record test depends on finding symbol test_loop in perf, and fails if perf has been stripped and no debug object is available. In that case, skip the test instead. Example: Note, building with perl support adds option -Wl,-E which causes the linker to add all (global) symbols to the dynamic symbol table. So the test_loop symbol, being global, does not get stripped unless NO_LIBPERL=1 Before: $ make NO_LIBPERL=1 -C tools/perf >/dev/null 2>&1 $ strip tools/perf/perf $ tools/perf/perf buildid-cache -p `realpath tools/perf/perf` $ tools/perf/perf test -v 'record tests' 91: perf record tests : --- start --- test child forked, pid 118750 Basic --per-thread mode test Per-thread record [Failed missing output] Register capture test Register capture test [Success] Basic --system-wide mode test System-wide record [Skipped not supported] Basic target workload test Workload record [Failed missing output] test child finished with -1 ---- end ---- perf record tests: FAILED! After: $ tools/perf/perf test -v 'record tests' 91: perf record tests : --- start --- test child forked, pid 120025 perf does not have symbol 'test_loop' perf is missing symbols - skipping test test child finished with -2 ---- end ---- perf record tests: Skip Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: German Gomez Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20231123075848.9652-5-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh index c74412c68e65a..3d1a7759a7b2d 100755 --- a/tools/perf/tests/shell/record.sh +++ b/tools/perf/tests/shell/record.sh @@ -8,10 +8,16 @@ shelldir=$(dirname "$0") # shellcheck source=lib/waiting.sh . "${shelldir}"/lib/waiting.sh +# shellcheck source=lib/perf_has_symbol.sh +. "${shelldir}"/lib/perf_has_symbol.sh + +testsym="test_loop" + +skip_test_missing_symbol ${testsym} + err=0 perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX) testprog="perf test -w thloop" -testsym="test_loop" cpu_pmu_dir="/sys/bus/event_source/devices/cpu*" br_cntr_file="/caps/branch_counter_nr" br_cntr_output="branch stack counters" -- GitLab From fc1de29a8b8ad46b590b2d389b53b4ecf9758273 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 23 Nov 2023 09:58:45 +0200 Subject: [PATCH 0135/1290] perf tests: Skip Arm64 callgraphs test if leafloop symbol is missing The test "Check Arm64 callgraphs are complete in fp mode" depends on finding symbol leafloop in perf, and fails if perf has been stripped and no debug object is available. In that case, skip the test instead. Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: German Gomez Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20231123075848.9652-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_arm_callgraph_fp.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/perf/tests/shell/test_arm_callgraph_fp.sh b/tools/perf/tests/shell/test_arm_callgraph_fp.sh index 66dfdfdad553f..e342e6c8aa50c 100755 --- a/tools/perf/tests/shell/test_arm_callgraph_fp.sh +++ b/tools/perf/tests/shell/test_arm_callgraph_fp.sh @@ -2,8 +2,14 @@ # Check Arm64 callgraphs are complete in fp mode # SPDX-License-Identifier: GPL-2.0 +shelldir=$(dirname "$0") +# shellcheck source=lib/perf_has_symbol.sh +. "${shelldir}"/lib/perf_has_symbol.sh + lscpu | grep -q "aarch64" || exit 2 +skip_test_missing_symbol leafloop + PERF_DATA=$(mktemp /tmp/__perf_test.perf.data.XXXXX) TEST_PROGRAM="perf test -w leafloop" -- GitLab From fcfb5a6189f55669c931dce9fec85280655c515f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 23 Nov 2023 09:58:46 +0200 Subject: [PATCH 0136/1290] perf tests: Skip branch stack sampling test if brstack_bench symbol is missing The test "Check branch stack sampling" depends on finding symbol brstack_bench (and several others) in perf, and fails if perf has been stripped and no debug object is available. In that case, skip the test instead. Example: Before: $ strip tools/perf/perf $ tools/perf/perf buildid-cache -p `realpath tools/perf/perf` $ tools/perf/perf test -v 'branch stack sampling' 112: Check branch stack sampling : --- start --- test child forked, pid 123741 Testing user branch stack sampling + grep -E -m1 ^brstack_bench\+[^ ]*/brstack_foo\+[^ ]*/IND_CALL/.*$ /tmp/__perf_test.program.5Dz1U/perf.script + cleanup + rm -rf /tmp/__perf_test.program.5Dz1U test child finished with -1 ---- end ---- Check branch stack sampling: FAILED! After: $ tools/perf/perf test -v 'branch stack sampling' 112: Check branch stack sampling : --- start --- test child forked, pid 125157 perf does not have symbol 'brstack_bench' perf is missing symbols - skipping test test child finished with -2 ---- end ---- Check branch stack sampling: Skip Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: German Gomez Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20231123075848.9652-7-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_brstack.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/perf/tests/shell/test_brstack.sh b/tools/perf/tests/shell/test_brstack.sh index 09908d71c9941..5f14d0cb013f8 100755 --- a/tools/perf/tests/shell/test_brstack.sh +++ b/tools/perf/tests/shell/test_brstack.sh @@ -4,6 +4,10 @@ # SPDX-License-Identifier: GPL-2.0 # German Gomez , 2022 +shelldir=$(dirname "$0") +# shellcheck source=lib/perf_has_symbol.sh +. "${shelldir}"/lib/perf_has_symbol.sh + # skip the test if the hardware doesn't support branch stack sampling # and if the architecture doesn't support filter types: any,save_type,u if ! perf record -o- --no-buildid --branch-filter any,save_type,u -- true > /dev/null 2>&1 ; then @@ -11,6 +15,8 @@ if ! perf record -o- --no-buildid --branch-filter any,save_type,u -- true > /dev exit 2 fi +skip_test_missing_symbol brstack_bench + TMPDIR=$(mktemp -d /tmp/__perf_test.program.XXXXX) TESTPROG="perf test -w brstack" -- GitLab From 3b24b15cf6fb2dbe1d009a52c9ddcb7721503d8f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 23 Nov 2023 09:58:47 +0200 Subject: [PATCH 0137/1290] perf tests: Make data symbol test wait for perf to start The perf data symbol test waits 1 second for perf to run and collect data, which may be too little if perf takes a long time to start up, which has been noticed on systems with many CPUs. Use existing wait_for_perf_to_start helper to wait for perf to start. Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: German Gomez Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20231123075848.9652-8-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_data_symbol.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/shell/test_data_symbol.sh b/tools/perf/tests/shell/test_data_symbol.sh index 69bb6fe86c507..e50e54e94f6ff 100755 --- a/tools/perf/tests/shell/test_data_symbol.sh +++ b/tools/perf/tests/shell/test_data_symbol.sh @@ -4,6 +4,10 @@ # SPDX-License-Identifier: GPL-2.0 # Leo Yan , 2022 +shelldir=$(dirname "$0") +# shellcheck source=lib/waiting.sh +. "${shelldir}"/lib/waiting.sh + skip_if_no_mem_event() { perf mem record -e list 2>&1 | grep -E -q 'available' && return 0 return 2 @@ -13,6 +17,7 @@ skip_if_no_mem_event || exit 2 TEST_PROGRAM="perf test -w datasym" PERF_DATA=$(mktemp /tmp/__perf_test.perf.data.XXXXX) +ERR_FILE=$(mktemp /tmp/__perf_test.stderr.XXXXX) check_result() { # The memory report format is as below: @@ -50,13 +55,15 @@ echo "Recording workload..." # specific CPU and test in per-CPU mode. is_amd=$(grep -E -c 'vendor_id.*AuthenticAMD' /proc/cpuinfo) if (($is_amd >= 1)); then - perf mem record -o ${PERF_DATA} -C 0 -- taskset -c 0 $TEST_PROGRAM & + perf mem record -vvv -o ${PERF_DATA} -C 0 -- taskset -c 0 $TEST_PROGRAM 2>"${ERR_FILE}" & else - perf mem record --all-user -o ${PERF_DATA} -- $TEST_PROGRAM & + perf mem record -vvv --all-user -o ${PERF_DATA} -- $TEST_PROGRAM 2>"${ERR_FILE}" & fi PERFPID=$! +wait_for_perf_to_start ${PERFPID} "${ERR_FILE}" + sleep 1 kill $PERFPID -- GitLab From 124bf6360ad8fe9267017d10b5dd465d4af73247 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 23 Nov 2023 09:58:48 +0200 Subject: [PATCH 0138/1290] perf tests: Skip data symbol test if buf1 symbol is missing perf data symbol test depends on finding symbol buf1 in perf, and fails if perf has been stripped and no debug object is available. In that case, skip the test instead. Example: Before: $ strip tools/perf/perf $ tools/perf/perf buildid-cache -p `realpath tools/perf/perf` $ tools/perf/perf test -v 'data symbol' 113: Test data symbol : --- start --- test child forked, pid 125646 Recording workload... [ perf record: Woken up 3 times to write data ] [ perf record: Captured and wrote 0.577 MB /tmp/__perf_test.perf.data.Jhbdp (7794 samples) ] Cleaning up files... test child finished with -1 ---- end ---- Test data symbol: FAILED! After: $ tools/perf/perf test -v 'data symbol' 113: Test data symbol : --- start --- test child forked, pid 125747 perf does not have symbol 'buf1' perf is missing symbols - skipping test test child finished with -2 ---- end ---- Test data symbol: Skip Signed-off-by: Adrian Hunter Acked-by: Ian Rogers Cc: German Gomez Cc: James Clark Cc: Jiri Olsa Cc: Leo Yan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20231123075848.9652-9-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/test_data_symbol.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/perf/tests/shell/test_data_symbol.sh b/tools/perf/tests/shell/test_data_symbol.sh index e50e54e94f6ff..3dfa91832aa87 100755 --- a/tools/perf/tests/shell/test_data_symbol.sh +++ b/tools/perf/tests/shell/test_data_symbol.sh @@ -8,6 +8,9 @@ shelldir=$(dirname "$0") # shellcheck source=lib/waiting.sh . "${shelldir}"/lib/waiting.sh +# shellcheck source=lib/perf_has_symbol.sh +. "${shelldir}"/lib/perf_has_symbol.sh + skip_if_no_mem_event() { perf mem record -e list 2>&1 | grep -E -q 'available' && return 0 return 2 @@ -15,6 +18,8 @@ skip_if_no_mem_event() { skip_if_no_mem_event || exit 2 +skip_test_missing_symbol buf1 + TEST_PROGRAM="perf test -w datasym" PERF_DATA=$(mktemp /tmp/__perf_test.perf.data.XXXXX) ERR_FILE=$(mktemp /tmp/__perf_test.stderr.XXXXX) -- GitLab From 19dd49c9337a482325d4dd7000e328dac11f6b5c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 9 Nov 2023 15:27:32 -0800 Subject: [PATCH 0139/1290] perf vendor events: Add skx, clx, icx and spr upi bandwidth metric Add upi_data_receive_bw metric for skylakex, cascadelakex, icelakex and sapphirerapids. The metric was added to perfmon metrics in: https://github.com/intel/perfmon/pull/119 Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Caleb Biggers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231109232732.2973015-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json | 6 ++++++ tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json | 6 ++++++ .../pmu-events/arch/x86/sapphirerapids/spr-metrics.json | 6 ++++++ tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json | 6 ++++++ 4 files changed, 24 insertions(+) diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json index 84c132af3dfa5..8bc6c07078566 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json @@ -1862,6 +1862,12 @@ "MetricName": "uncore_frequency", "ScaleUnit": "1GHz" }, + { + "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data receive bandwidth (MB/sec)", + "MetricExpr": "UNC_UPI_RxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", + "MetricName": "upi_data_receive_bw", + "ScaleUnit": "1MB/s" + }, { "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", diff --git a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json index e98602c667072..71d78a7841ea8 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json @@ -1846,6 +1846,12 @@ "MetricName": "uncore_frequency", "ScaleUnit": "1GHz" }, + { + "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data receive bandwidth (MB/sec)", + "MetricExpr": "UNC_UPI_RxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", + "MetricName": "upi_data_receive_bw", + "ScaleUnit": "1MB/s" + }, { "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json index 06c6d67cb76b0..e31a4aac9f205 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json @@ -1964,6 +1964,12 @@ "MetricName": "uncore_frequency", "ScaleUnit": "1GHz" }, + { + "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data receive bandwidth (MB/sec)", + "MetricExpr": "UNC_UPI_RxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", + "MetricName": "upi_data_receive_bw", + "ScaleUnit": "1MB/s" + }, { "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json index 4a8f8eeb75255..ec3aa5ef00a3c 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -1806,6 +1806,12 @@ "MetricName": "uncore_frequency", "ScaleUnit": "1GHz" }, + { + "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data receive bandwidth (MB/sec)", + "MetricExpr": "UNC_UPI_RxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", + "MetricName": "upi_data_receive_bw", + "ScaleUnit": "1MB/s" + }, { "BriefDescription": "Intel(R) Ultra Path Interconnect (UPI) data transmit bandwidth (MB/sec)", "MetricExpr": "UNC_UPI_TxL_FLITS.ALL_DATA * 7.111111111111111 / 1e6 / duration_time", -- GitLab From 7340c6df49df1b261892d287444c255d0a378063 Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Wed, 22 Nov 2023 20:41:06 +0800 Subject: [PATCH 0140/1290] perf vendor events riscv: add T-HEAD C9xx JSON file Add JSON file of T-HEAD C9xx series events. The event idx (raw value) is summary as following: event id range | support cpu 0x01 - 0x2a | c906,c910,c920 The event ids are based on the public document of T-HEAD and cover the c900 series. These events are the max that c900 series support. Since T-HEAD let manufacturers decide whether events are usable, the final support of the perf events is determined by the pmu node of the soc dtb. Signed-off-by: Inochi Amaoto Tested-by: Guo Ren Acked-by: Palmer Dabbelt Cc: Adrian Hunter Cc: Albert Ou Cc: Alexander Shishkin Cc: Chen Wang Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jisheng Zhang Cc: Mark Rutland Cc: Namhyung Kim Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Wei Fu Cc: linux-riscv@lists.infradead.org Link: https://lore.kernel.org/r/IA1PR20MB495325FCF603BAA841E29281BBBAA@IA1PR20MB4953.namprd20.prod.outlook.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/riscv/mapfile.csv | 1 + .../arch/riscv/thead/c900-legacy/cache.json | 67 ++++++++++++++++ .../riscv/thead/c900-legacy/firmware.json | 68 ++++++++++++++++ .../riscv/thead/c900-legacy/instruction.json | 72 +++++++++++++++++ .../riscv/thead/c900-legacy/microarch.json | 80 +++++++++++++++++++ 5 files changed, 288 insertions(+) create mode 100644 tools/perf/pmu-events/arch/riscv/thead/c900-legacy/cache.json create mode 100644 tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json create mode 100644 tools/perf/pmu-events/arch/riscv/thead/c900-legacy/instruction.json create mode 100644 tools/perf/pmu-events/arch/riscv/thead/c900-legacy/microarch.json diff --git a/tools/perf/pmu-events/arch/riscv/mapfile.csv b/tools/perf/pmu-events/arch/riscv/mapfile.csv index 56b03138d46aa..cfc449b198105 100644 --- a/tools/perf/pmu-events/arch/riscv/mapfile.csv +++ b/tools/perf/pmu-events/arch/riscv/mapfile.csv @@ -15,4 +15,5 @@ # #MVENDORID-MARCHID-MIMPID,Version,Filename,EventType 0x489-0x8000000000000007-0x[[:xdigit:]]+,v1,sifive/u74,core +0x5b7-0x0-0x0,v1,thead/c900-legacy,core 0x67e-0x80000000db0000[89]0-0x[[:xdigit:]]+,v1,starfive/dubhe-80,core diff --git a/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/cache.json b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/cache.json new file mode 100644 index 0000000000000..2b142348d6359 --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/cache.json @@ -0,0 +1,67 @@ +[ + { + "EventName": "L1_ICACHE_ACCESS", + "EventCode": "0x00000001", + "BriefDescription": "L1 instruction cache access" + }, + { + "EventName": "L1_ICACHE_MISS", + "EventCode": "0x00000002", + "BriefDescription": "L1 instruction cache miss" + }, + { + "EventName": "ITLB_MISS", + "EventCode": "0x00000003", + "BriefDescription": "I-UTLB miss" + }, + { + "EventName": "DTLB_MISS", + "EventCode": "0x00000004", + "BriefDescription": "D-UTLB miss" + }, + { + "EventName": "JTLB_MISS", + "EventCode": "0x00000005", + "BriefDescription": "JTLB miss" + }, + { + "EventName": "L1_DCACHE_READ_ACCESS", + "EventCode": "0x0000000c", + "BriefDescription": "L1 data cache read access" + }, + { + "EventName": "L1_DCACHE_READ_MISS", + "EventCode": "0x0000000d", + "BriefDescription": "L1 data cache read miss" + }, + { + "EventName": "L1_DCACHE_WRITE_ACCESS", + "EventCode": "0x0000000e", + "BriefDescription": "L1 data cache write access" + }, + { + "EventName": "L1_DCACHE_WRITE_MISS", + "EventCode": "0x0000000f", + "BriefDescription": "L1 data cache write miss" + }, + { + "EventName": "LL_CACHE_READ_ACCESS", + "EventCode": "0x00000010", + "BriefDescription": "LL Cache read access" + }, + { + "EventName": "LL_CACHE_READ_MISS", + "EventCode": "0x00000011", + "BriefDescription": "LL Cache read miss" + }, + { + "EventName": "LL_CACHE_WRITE_ACCESS", + "EventCode": "0x00000012", + "BriefDescription": "LL Cache write access" + }, + { + "EventName": "LL_CACHE_WRITE_MISS", + "EventCode": "0x00000013", + "BriefDescription": "LL Cache write miss" + } +] diff --git a/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json new file mode 100644 index 0000000000000..9b4a032186a7b --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json @@ -0,0 +1,68 @@ +[ + { + "ArchStdEvent": "FW_MISALIGNED_LOAD" + }, + { + "ArchStdEvent": "FW_MISALIGNED_STORE" + }, + { + "ArchStdEvent": "FW_ACCESS_LOAD" + }, + { + "ArchStdEvent": "FW_ACCESS_STORE" + }, + { + "ArchStdEvent": "FW_ILLEGAL_INSN" + }, + { + "ArchStdEvent": "FW_SET_TIMER" + }, + { + "ArchStdEvent": "FW_IPI_SENT" + }, + { + "ArchStdEvent": "FW_IPI_RECEIVED" + }, + { + "ArchStdEvent": "FW_FENCE_I_SENT" + }, + { + "ArchStdEvent": "FW_FENCE_I_RECEIVED" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_SENT" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_VMID_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_VMID_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_ASID_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_ASID_RECEIVED" + } +] diff --git a/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/instruction.json b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/instruction.json new file mode 100644 index 0000000000000..c822b53733339 --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/instruction.json @@ -0,0 +1,72 @@ +[ + { + "EventName": "INST_BRANCH_MISPREDICT", + "EventCode": "0x00000006", + "BriefDescription": "Mispredicted branch instructions" + }, + { + "EventName": "INST_BRANCH", + "EventCode": "0x00000007", + "BriefDescription": "Retired branch instructions" + }, + { + "EventName": "INST_JMP_MISPREDICT", + "EventCode": "0x00000008", + "BriefDescription": "Indirect branch mispredict" + }, + { + "EventName": "INST_JMP", + "EventCode": "0x00000009", + "BriefDescription": "Retired jmp instructions" + }, + { + "EventName": "INST_STORE", + "EventCode": "0x0000000b", + "BriefDescription": "Retired store instructions" + }, + { + "EventName": "INST_ALU", + "EventCode": "0x0000001d", + "BriefDescription": "Retired ALU instructions" + }, + { + "EventName": "INST_LDST", + "EventCode": "0x0000001e", + "BriefDescription": "Retired Load/Store instructions" + }, + { + "EventName": "INST_VECTOR", + "EventCode": "0x0000001f", + "BriefDescription": "Retired Vector instructions" + }, + { + "EventName": "INST_CSR", + "EventCode": "0x00000020", + "BriefDescription": "Retired CSR instructions" + }, + { + "EventName": "INST_SYNC", + "EventCode": "0x00000021", + "BriefDescription": "Retired sync instructions (AMO/LR/SC instructions)" + }, + { + "EventName": "INST_UNALIGNED_ACCESS", + "EventCode": "0x00000022", + "BriefDescription": "Retired Store/Load instructions with unaligned memory access" + }, + { + "EventName": "INST_ECALL", + "EventCode": "0x00000025", + "BriefDescription": "Retired ecall instructions" + }, + { + "EventName": "INST_LONG_JP", + "EventCode": "0x00000026", + "BriefDescription": "Retired long jump instructions" + }, + { + "EventName": "INST_FP", + "EventCode": "0x0000002a", + "BriefDescription": "Retired FPU instructions" + } +] diff --git a/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/microarch.json b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/microarch.json new file mode 100644 index 0000000000000..0ab6f288af91d --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/microarch.json @@ -0,0 +1,80 @@ +[ + { + "EventName": "LSU_SPEC_FAIL", + "EventCode": "0x0000000a", + "BriefDescription": "LSU speculation fail" + }, + { + "EventName": "IDU_RF_PIPE_FAIL", + "EventCode": "0x00000014", + "BriefDescription": "Instruction decode unit launch pipeline failed in RF state" + }, + { + "EventName": "IDU_RF_REG_FAIL", + "EventCode": "0x00000015", + "BriefDescription": "Instruction decode unit launch register file fail in RF state" + }, + { + "EventName": "IDU_RF_INSTRUCTION", + "EventCode": "0x00000016", + "BriefDescription": "retired instruction count of Instruction decode unit in RF (Register File) stage" + }, + { + "EventName": "LSU_4K_STALL", + "EventCode": "0x00000017", + "BriefDescription": "LSU stall times for long distance data access (Over 4K)", + "PublicDescription": "This stall occurs when translate virtual address with page offset over 4k" + }, + { + "EventName": "LSU_OTHER_STALL", + "EventCode": "0x00000018", + "BriefDescription": "LSU stall times for other reasons (except the 4k stall)" + }, + { + "EventName": "LSU_SQ_OTHER_DIS", + "EventCode": "0x00000019", + "BriefDescription": "LSU store queue discard others" + }, + { + "EventName": "LSU_SQ_DATA_DISCARD", + "EventCode": "0x0000001a", + "BriefDescription": "LSU store queue discard data (uops)" + }, + { + "EventName": "BRANCH_DIRECTION_MISPREDICTION", + "EventCode": "0x0000001b", + "BriefDescription": "Branch misprediction in BTB" + }, + { + "EventName": "BRANCH_DIRECTION_PREDICTION", + "EventCode": "0x0000001c", + "BriefDescription": "All branch prediction in BTB", + "PublicDescription": "This event including both successful prediction and failed prediction in BTB" + }, + { + "EventName": "INTERRUPT_ACK_COUNT", + "EventCode": "0x00000023", + "BriefDescription": "acknowledged interrupt count" + }, + { + "EventName": "INTERRUPT_OFF_CYCLE", + "EventCode": "0x00000024", + "BriefDescription": "PLIC arbitration time when the interrupt is not responded", + "PublicDescription": "The arbitration time is recorded while meeting any of the following:\n- CPU is M-mode and MIE == 0\n- CPU is S-mode and delegation and SIE == 0\n" + }, + { + "EventName": "IFU_STALLED_CYCLE", + "EventCode": "0x00000027", + "BriefDescription": "Number of stall cycles of the instruction fetch unit (IFU)." + }, + { + "EventName": "IDU_STALLED_CYCLE", + "EventCode": "0x00000028", + "BriefDescription": "hpcp_backend_stall Number of stall cycles of the instruction decoding unit (IDU) and next-level pipeline unit." + }, + { + "EventName": "SYNC_STALL", + "EventCode": "0x00000029", + "BriefDescription": "Sync instruction stall cycle fence/fence.i/sync/sfence" + } +] -- GitLab From ffa96259ca5f02bbcecd2c45831e7632cd6d3485 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 13 Nov 2023 10:23:24 +0000 Subject: [PATCH 0141/1290] perf test: Use existing config value for objdump path There is already an existing config value for changing the objdump path, so instead of having two values that do the same thing, make 'perf test' use annotate.objdump as well. Signed-off-by: James Clark Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Fangrui Song Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Cc: Yang Jihong Cc: Yonghong Song Link: https://lore.kernel.org/r/ZU5Cx4LTrB5q0sIG@kernel.org Link: https://lore.kernel.org/r/20231113102327.695386-1-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-config.txt | 8 ++------ tools/perf/tests/builtin-test.c | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 16398babd1ef5..379f9d7a8ab11 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -251,7 +251,8 @@ annotate.*:: addr2line binary to use for file names and line numbers. annotate.objdump:: - objdump binary to use for disassembly and annotations. + objdump binary to use for disassembly and annotations, + including in the 'perf test' command. annotate.disassembler_style:: Use this to change the default disassembler style to some other value @@ -722,11 +723,6 @@ session-.*:: Defines new record session for daemon. The value is record's command line without the 'record' keyword. -test.*:: - - test.objdump:: - objdump binary to use for disassembly and annotations. - SEE ALSO -------- linkperf:perf[1] diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 113e92119e1d1..b8c21e81a021b 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -518,7 +518,7 @@ static int run_workload(const char *work, int argc, const char **argv) static int perf_test__config(const char *var, const char *value, void *data __maybe_unused) { - if (!strcmp(var, "test.objdump")) + if (!strcmp(var, "annotate.objdump")) test_objdump_path = value; return 0; -- GitLab From 08973307d28311505b85216d724826e2ca21759e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 11 Oct 2023 20:50:25 -0700 Subject: [PATCH 0142/1290] perf annotate: Check if operand has multiple regs It needs to check all possible information in an instruction. Let's add a field indicating if the operand has multiple registers. I'll be used to search type information like in an array access on x86 like: mov 0x10(%rax,%rbx,8), %rcx ------------- here Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231012035111.676789-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 36 ++++++++++++++++++++++++++++++++++++ tools/perf/util/annotate.h | 2 ++ 2 files changed, 38 insertions(+) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 3364edf30f50e..9a828dc601c75 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -85,6 +85,8 @@ struct arch { struct { char comment_char; char skip_functions_char; + char register_char; + char memory_ref_char; } objdump; }; @@ -188,6 +190,8 @@ static struct arch architectures[] = { .insn_suffix = "bwlq", .objdump = { .comment_char = '#', + .register_char = '%', + .memory_ref_char = '(', }, }, { @@ -566,6 +570,34 @@ static struct ins_ops lock_ops = { .scnprintf = lock__scnprintf, }; +/* + * Check if the operand has more than one registers like x86 SIB addressing: + * 0x1234(%rax, %rbx, 8) + * + * But it doesn't care segment selectors like %gs:0x5678(%rcx), so just check + * the input string after 'memory_ref_char' if exists. + */ +static bool check_multi_regs(struct arch *arch, const char *op) +{ + int count = 0; + + if (arch->objdump.register_char == 0) + return false; + + if (arch->objdump.memory_ref_char) { + op = strchr(op, arch->objdump.memory_ref_char); + if (op == NULL) + return false; + } + + while ((op = strchr(op, arch->objdump.register_char)) != NULL) { + count++; + op++; + } + + return count > 1; +} + static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms __maybe_unused) { char *s = strchr(ops->raw, ','), *target, *comment, prev; @@ -593,6 +625,8 @@ static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_sy if (ops->source.raw == NULL) return -1; + ops->source.multi_regs = check_multi_regs(arch, ops->source.raw); + target = skip_spaces(++s); comment = strchr(s, arch->objdump.comment_char); @@ -613,6 +647,8 @@ static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_sy if (ops->target.raw == NULL) goto out_free_source; + ops->target.multi_regs = check_multi_regs(arch, ops->target.raw); + if (comment == NULL) return 0; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index bc8b95e8b1be1..b64a2be287b30 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -39,12 +39,14 @@ struct ins_operands { s64 offset; bool offset_avail; bool outside; + bool multi_regs; } target; union { struct { char *raw; char *name; u64 addr; + bool multi_regs; } source; struct { struct ins ins; -- GitLab From a19937d829fbe3103e4596c8810a3e5cb372c6d4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 5 Nov 2023 16:52:17 +0900 Subject: [PATCH 0143/1290] genksyms: remove the remnant of the -s option Commit 74d931716151 ("genksyms: remove symbol prefix support") removed the -s (--symbol-prefix) option. Clean up the left-over. Signed-off-by: Masahiro Yamada --- scripts/genksyms/genksyms.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/genksyms/genksyms.c b/scripts/genksyms/genksyms.c index f5dfdb9d80e9d..6636d5b30eba1 100644 --- a/scripts/genksyms/genksyms.c +++ b/scripts/genksyms/genksyms.c @@ -719,7 +719,6 @@ static void genksyms_usage(void) { fputs("Usage:\n" "genksyms [-adDTwqhVR] > /path/to/.tmp_obj.ver\n" "\n" #ifdef __GNU_LIBRARY__ - " -s, --symbol-prefix Select symbol prefix\n" " -d, --debug Increment the debug level (repeatable)\n" " -D, --dump Dump expanded symbol defs (for debugging only)\n" " -r, --reference file Read reference symbols from a file\n" @@ -730,7 +729,6 @@ static void genksyms_usage(void) " -h, --help Print this message\n" " -V, --version Print the release version\n" #else /* __GNU_LIBRARY__ */ - " -s Select symbol prefix\n" " -d Increment the debug level (repeatable)\n" " -D Dump expanded symbol defs (for debugging only)\n" " -r file Read reference symbols from a file\n" @@ -763,10 +761,10 @@ int main(int argc, char **argv) {0, 0, 0, 0} }; - while ((o = getopt_long(argc, argv, "s:dwqVDr:T:ph", + while ((o = getopt_long(argc, argv, "dwqVDr:T:ph", &long_opts[0], NULL)) != EOF) #else /* __GNU_LIBRARY__ */ - while ((o = getopt(argc, argv, "s:dwqVDr:T:ph")) != EOF) + while ((o = getopt(argc, argv, "dwqVDr:T:ph")) != EOF) #endif /* __GNU_LIBRARY__ */ switch (o) { case 'd': -- GitLab From 96a29581e735bcf3b4e5a4f2daad9f445025f510 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 5 Nov 2023 16:52:18 +0900 Subject: [PATCH 0144/1290] genksyms: use getopt_long() unconditionally getopt_long() is used by various tools in the kernel (e.g. Kconfig). It should be fine to use it all the time. Signed-off-by: Masahiro Yamada --- scripts/genksyms/genksyms.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/scripts/genksyms/genksyms.c b/scripts/genksyms/genksyms.c index 6636d5b30eba1..f3901c55df239 100644 --- a/scripts/genksyms/genksyms.c +++ b/scripts/genksyms/genksyms.c @@ -16,9 +16,7 @@ #include #include #include -#ifdef __GNU_LIBRARY__ #include -#endif /* __GNU_LIBRARY__ */ #include "genksyms.h" /*----------------------------------------------------------------------*/ @@ -718,7 +716,6 @@ void error_with_pos(const char *fmt, ...) static void genksyms_usage(void) { fputs("Usage:\n" "genksyms [-adDTwqhVR] > /path/to/.tmp_obj.ver\n" "\n" -#ifdef __GNU_LIBRARY__ " -d, --debug Increment the debug level (repeatable)\n" " -D, --dump Dump expanded symbol defs (for debugging only)\n" " -r, --reference file Read reference symbols from a file\n" @@ -728,17 +725,6 @@ static void genksyms_usage(void) " -q, --quiet Disable warnings (default)\n" " -h, --help Print this message\n" " -V, --version Print the release version\n" -#else /* __GNU_LIBRARY__ */ - " -d Increment the debug level (repeatable)\n" - " -D Dump expanded symbol defs (for debugging only)\n" - " -r file Read reference symbols from a file\n" - " -T file Dump expanded types into file\n" - " -p Preserve reference modversions or fail\n" - " -w Enable warnings\n" - " -q Disable warnings (default)\n" - " -h Print this message\n" - " -V Print the release version\n" -#endif /* __GNU_LIBRARY__ */ , stderr); } @@ -747,7 +733,6 @@ int main(int argc, char **argv) FILE *dumpfile = NULL, *ref_file = NULL; int o; -#ifdef __GNU_LIBRARY__ struct option long_opts[] = { {"debug", 0, 0, 'd'}, {"warnings", 0, 0, 'w'}, @@ -763,9 +748,6 @@ int main(int argc, char **argv) while ((o = getopt_long(argc, argv, "dwqVDr:T:ph", &long_opts[0], NULL)) != EOF) -#else /* __GNU_LIBRARY__ */ - while ((o = getopt(argc, argv, "dwqVDr:T:ph")) != EOF) -#endif /* __GNU_LIBRARY__ */ switch (o) { case 'd': flag_debug++; -- GitLab From ce1fc9345a59c55d3a46dd7da872791cae41324e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 6 Nov 2023 03:10:47 +0900 Subject: [PATCH 0145/1290] kconfig: do not clear SYMBOL_DEF_USER when the value is out of range When a user-supplied value is out of range, (NEW) and an incorrect default value are shown. [Test Kconfig] config FOO int "foo" range 10 20 [Test .config] CONFIG_FOO=30 [Result without this fix] $ make config * * Main menu * foo (FOO) [10] (NEW) [Result with this fix] $ make config * * Main menu * foo (FOO) [20] Currently, the SYMBOL_DEF_USER is cleared if the user input does not reside within the range. Kconfig forgets the initial value 30, and prints (NEW) and an incorrect default [10]. Kconfig should remember the user's input. The default should be [20] because the user's input, 30, is closer to the upper limit of the range. Please note it will not show up in "make oldconfig" because it is no longer considered as a new symbol. It also fixes the inconsistent behavior in listnewconfig/helpnewconfig. Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 4a6811d77d182..7fca9cc3ae748 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -594,7 +594,7 @@ int conf_read(const char *name) /* Reset a string value if it's out of range */ if (sym_string_within_range(sym, sym->def[S_DEF_USER].val)) break; - sym->flags &= ~(SYMBOL_VALID|SYMBOL_DEF_USER); + sym->flags &= ~SYMBOL_VALID; conf_unsaved++; break; default: -- GitLab From 259b8bd13db5f61fcc60192d4f73eb2eac9c426f Mon Sep 17 00:00:00 2001 From: Dmitrii Bundin Date: Mon, 6 Nov 2023 00:56:22 +0300 Subject: [PATCH 0146/1290] kbuild: deb-pkg: apply short -R and -j options The long version --rules-file and --jobs are available since 1.18.8 while their short analogues -R and -j have been added since 1.14.7. The option --rules-file the way it works currently was introduced in the commit 5cd52673aabdf5eaa58181972119a41041fc85f2 of dpkg dated 23.07.18 with the following changelog entry: * Fix dpkg-buildpackage option --rules-file parsing. It was trying to parse it as --rules-target, which due to the ordering was a no-op. The current behavior of the long version --rules-file is guaranteed to be in use starting 1.19.1 and might cause build failures for some versions newer than 1.18.8 even in spite of being documented that way. Signed-off-by: Dmitrii Bundin Signed-off-by: Masahiro Yamada --- scripts/Makefile.package | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.package b/scripts/Makefile.package index 3addd1c0b989a..f30349f46a975 100644 --- a/scripts/Makefile.package +++ b/scripts/Makefile.package @@ -146,7 +146,7 @@ deb-pkg srcdeb-pkg bindeb-pkg: $(if $(findstring source, $(build-type)), \ --unsigned-source --compression=$(KDEB_SOURCE_COMPRESS)) \ $(if $(findstring binary, $(build-type)), \ - --rules-file='$(MAKE) -f debian/rules' --jobs=1 -r$(KBUILD_PKG_ROOTCMD) -a$$(cat debian/arch), \ + -R'$(MAKE) -f debian/rules' -j1 -r$(KBUILD_PKG_ROOTCMD) -a$$(cat debian/arch), \ --no-check-builddeps) \ $(DPKG_FLAGS)) -- GitLab From 61e3e3c21a9599f7f2c6f15f7e4b099cf6ea290e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 18 Nov 2023 15:18:36 +0900 Subject: [PATCH 0147/1290] kconfig: remove error check for xrealloc() xrealloc() never returns NULL as it is checked in the callee. This is a left-over of commit d717f24d8c68 ("kconfig: add xrealloc() helper"). Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 7fca9cc3ae748..2ba4dfdd1aeee 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -289,16 +289,12 @@ static int conf_set_sym_val(struct symbol *sym, int def, int def_flags, char *p) #define LINE_GROWTH 16 static int add_byte(int c, char **lineptr, size_t slen, size_t *n) { - char *nline; size_t new_size = slen + 1; + if (new_size > *n) { new_size += LINE_GROWTH - 1; new_size *= 2; - nline = xrealloc(*lineptr, new_size); - if (!nline) - return -1; - - *lineptr = nline; + *lineptr = xrealloc(*lineptr, new_size); *n = new_size; } -- GitLab From 4d137ab0107ead0f2590fc0314e627431e3b9e3f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 18 Nov 2023 16:59:07 +0900 Subject: [PATCH 0148/1290] kconfig: require a space after '#' for valid input Currently, when an input line starts with '#', (line + 2) is passed to memcmp() without checking line[1]. It means that line[1] can be any arbitrary character. For example, "#KCONFIG_FOO is not set" is accepted as valid input, functioning the same as "# CONFIG_FOO is not set". More importantly, this can potentially lead to a buffer overrun if line[1] == '\0'. It occurs if the input only contains '#', as (line + 2) points to an uninitialized buffer. Check line[1], and skip the line if it is not a space. Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 2ba4dfdd1aeee..556b7f087dbb5 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -426,6 +426,8 @@ load: conf_lineno++; sym = NULL; if (line[0] == '#') { + if (line[1] != ' ') + continue; if (memcmp(line + 2, CONFIG_, strlen(CONFIG_))) continue; p = strchr(line + 2 + strlen(CONFIG_), ' '); -- GitLab From 92d4fe0a48f1ab6cf20143dd0b376f4fe842854b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 18 Nov 2023 16:59:08 +0900 Subject: [PATCH 0149/1290] kconfig: remove unused code for S_DEF_AUTO in conf_read_simple() The 'else' arm here is unreachable in practical use cases. include/config/auto.conf does not include "# CONFIG_... is not set" line unless it is manually hacked. Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 556b7f087dbb5..92e8e37aca4d1 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -436,20 +436,15 @@ load: *p++ = 0; if (strncmp(p, "is not set", 10)) continue; - if (def == S_DEF_USER) { - sym = sym_find(line + 2 + strlen(CONFIG_)); - if (!sym) { - if (warn_unknown) - conf_warning("unknown symbol: %s", - line + 2 + strlen(CONFIG_)); - conf_set_changed(true); - continue; - } - } else { - sym = sym_lookup(line + 2 + strlen(CONFIG_), 0); - if (sym->type == S_UNKNOWN) - sym->type = S_BOOLEAN; + sym = sym_find(line + 2 + strlen(CONFIG_)); + if (!sym) { + if (warn_unknown) + conf_warning("unknown symbol: %s", + line + 2 + strlen(CONFIG_)); + + conf_set_changed(true); + continue; } if (sym->flags & def_flags) { conf_warning("override: reassigning to symbol %s", sym->name); -- GitLab From d854b4b21de684a16a7d6163c7b0e9c5ff8a09d3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 18 Nov 2023 16:59:09 +0900 Subject: [PATCH 0150/1290] kconfig: deduplicate code in conf_read_simple() Kconfig accepts both "# CONFIG_FOO is not set" and "CONFIG_FOO=n" as a valid input, but conf_read_simple() duplicates similar code to handle them. Factor out the common code. Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 89 +++++++++++++++----------------------- 1 file changed, 35 insertions(+), 54 deletions(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 92e8e37aca4d1..b6a90f6baea14 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -342,11 +342,10 @@ int conf_read_simple(const char *name, int def) FILE *in = NULL; char *line = NULL; size_t line_asize = 0; - char *p, *p2; + char *p, *p2, *val; struct symbol *sym; int i, def_flags; - const char *warn_unknown; - const char *werror; + const char *warn_unknown, *werror, *sym_name; warn_unknown = getenv("KCONFIG_WARN_UNKNOWN_SYMBOLS"); werror = getenv("KCONFIG_WERROR"); @@ -424,77 +423,34 @@ load: while (compat_getline(&line, &line_asize, in) != -1) { conf_lineno++; - sym = NULL; if (line[0] == '#') { if (line[1] != ' ') continue; - if (memcmp(line + 2, CONFIG_, strlen(CONFIG_))) + p = line + 2; + if (memcmp(p, CONFIG_, strlen(CONFIG_))) continue; - p = strchr(line + 2 + strlen(CONFIG_), ' '); + sym_name = p + strlen(CONFIG_); + p = strchr(sym_name, ' '); if (!p) continue; *p++ = 0; if (strncmp(p, "is not set", 10)) continue; - sym = sym_find(line + 2 + strlen(CONFIG_)); - if (!sym) { - if (warn_unknown) - conf_warning("unknown symbol: %s", - line + 2 + strlen(CONFIG_)); - - conf_set_changed(true); - continue; - } - if (sym->flags & def_flags) { - conf_warning("override: reassigning to symbol %s", sym->name); - } - switch (sym->type) { - case S_BOOLEAN: - case S_TRISTATE: - sym->def[def].tri = no; - sym->flags |= def_flags; - break; - default: - ; - } + val = "n"; } else if (memcmp(line, CONFIG_, strlen(CONFIG_)) == 0) { - p = strchr(line + strlen(CONFIG_), '='); + sym_name = line + strlen(CONFIG_); + p = strchr(sym_name, '='); if (!p) continue; *p++ = 0; + val = p; p2 = strchr(p, '\n'); if (p2) { *p2-- = 0; if (*p2 == '\r') *p2 = 0; } - - sym = sym_find(line + strlen(CONFIG_)); - if (!sym) { - if (def == S_DEF_AUTO) { - /* - * Reading from include/config/auto.conf - * If CONFIG_FOO previously existed in - * auto.conf but it is missing now, - * include/config/FOO must be touched. - */ - conf_touch_dep(line + strlen(CONFIG_)); - } else { - if (warn_unknown) - conf_warning("unknown symbol: %s", - line + strlen(CONFIG_)); - - conf_set_changed(true); - } - continue; - } - - if (sym->flags & def_flags) { - conf_warning("override: reassigning to symbol %s", sym->name); - } - if (conf_set_sym_val(sym, def, def_flags, p)) - continue; } else { if (line[0] != '\r' && line[0] != '\n') conf_warning("unexpected data: %.*s", @@ -503,6 +459,31 @@ load: continue; } + sym = sym_find(sym_name); + if (!sym) { + if (def == S_DEF_AUTO) { + /* + * Reading from include/config/auto.conf. + * If CONFIG_FOO previously existed in auto.conf + * but it is missing now, include/config/FOO + * must be touched. + */ + conf_touch_dep(sym_name); + } else { + if (warn_unknown) + conf_warning("unknown symbol: %s", sym_name); + + conf_set_changed(true); + } + continue; + } + + if (sym->flags & def_flags) + conf_warning("override: reassigning to symbol %s", sym->name); + + if (conf_set_sym_val(sym, def, def_flags, val)) + continue; + if (sym && sym_is_choice_value(sym)) { struct symbol *cs = prop_get_symbol(sym_get_choice_prop(sym)); switch (sym->def[def].tri) { -- GitLab From 9925d6b7d12f5019d2a6c465ae72093101edbfd4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 18 Nov 2023 16:59:10 +0900 Subject: [PATCH 0151/1290] kconfig: introduce getline_stripped() helper Currently, newline characters are stripped away in multiple places on the caller. Doing that in the callee is helpful for further cleanups. Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 40 +++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index b6a90f6baea14..795ac6c9378f1 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -337,12 +337,32 @@ e_out: return -1; } +/* like getline(), but the newline character is stripped away */ +static ssize_t getline_stripped(char **lineptr, size_t *n, FILE *stream) +{ + ssize_t len; + + len = compat_getline(lineptr, n, stream); + + if (len > 0 && (*lineptr)[len - 1] == '\n') { + len--; + (*lineptr)[len] = '\0'; + + if (len > 0 && (*lineptr)[len - 1] == '\r') { + len--; + (*lineptr)[len] = '\0'; + } + } + + return len; +} + int conf_read_simple(const char *name, int def) { FILE *in = NULL; char *line = NULL; size_t line_asize = 0; - char *p, *p2, *val; + char *p, *val; struct symbol *sym; int i, def_flags; const char *warn_unknown, *werror, *sym_name; @@ -421,7 +441,7 @@ load: } } - while (compat_getline(&line, &line_asize, in) != -1) { + while (getline_stripped(&line, &line_asize, in) != -1) { conf_lineno++; if (line[0] == '#') { if (line[1] != ' ') @@ -443,19 +463,11 @@ load: p = strchr(sym_name, '='); if (!p) continue; - *p++ = 0; - val = p; - p2 = strchr(p, '\n'); - if (p2) { - *p2-- = 0; - if (*p2 == '\r') - *p2 = 0; - } + *p = 0; + val = p + 1; } else { - if (line[0] != '\r' && line[0] != '\n') - conf_warning("unexpected data: %.*s", - (int)strcspn(line, "\r\n"), line); - + if (line[0] != '\0') + conf_warning("unexpected data: %s", line); continue; } -- GitLab From 4aced3ec84a848bd64bfd725e81c54eb31bf8b24 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 18 Nov 2023 16:59:11 +0900 Subject: [PATCH 0152/1290] kconfig: require an exact match for "is not set" to disable CONFIG option Currently, any string starting "is not set" disables a CONFIG option. For example, "# CONFIG_FOO is not settled down" is accepted as valid input, functioning the same as "# CONFIG_FOO is not set". It is a long-standing oddity. Check the line against the exact pattern "is not set". Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 795ac6c9378f1..958be12cd621a 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -454,7 +454,7 @@ load: if (!p) continue; *p++ = 0; - if (strncmp(p, "is not set", 10)) + if (strcmp(p, "is not set")) continue; val = "n"; -- GitLab From 48ab6c9c9256003a4f2d737ccdcba81e01ba4e68 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 18 Nov 2023 16:59:12 +0900 Subject: [PATCH 0153/1290] kconfig: massage the loop in conf_read_simple() Make the while-loop code a little more readable. The gain is that "CONFIG_FOO" without '=' is warned as unexpected data. Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 958be12cd621a..bd14aae1db582 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -443,6 +443,10 @@ load: while (getline_stripped(&line, &line_asize, in) != -1) { conf_lineno++; + + if (!line[0]) /* blank line */ + continue; + if (line[0] == '#') { if (line[1] != ' ') continue; @@ -458,17 +462,20 @@ load: continue; val = "n"; - } else if (memcmp(line, CONFIG_, strlen(CONFIG_)) == 0) { + } else { + if (memcmp(line, CONFIG_, strlen(CONFIG_))) { + conf_warning("unexpected data: %s", line); + continue; + } + sym_name = line + strlen(CONFIG_); p = strchr(sym_name, '='); - if (!p) + if (!p) { + conf_warning("unexpected data: %s", line); continue; + } *p = 0; val = p + 1; - } else { - if (line[0] != '\0') - conf_warning("unexpected data: %s", line); - continue; } sym = sym_find(sym_name); -- GitLab From 884f55f152cb028056bf9efe557a2d7346e932f5 Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Tue, 21 Nov 2023 12:58:54 +0100 Subject: [PATCH 0154/1290] kbuild: buildtar: Remove unused $dirs The shell variable $dirs is not used any more since 1fc9095846cc ("kbuild: tar-pkg: use tar rules in scripts/Makefile.package"), therefore remove it". Fixes: 1fc9095846cc ("kbuild: tar-pkg: use tar rules in scripts/Makefile.package") Signed-off-by: Petr Vorel Signed-off-by: Masahiro Yamada --- scripts/package/buildtar | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/package/buildtar b/scripts/package/buildtar index 65b4ea5029621..8ac075dd0e9c5 100755 --- a/scripts/package/buildtar +++ b/scripts/package/buildtar @@ -23,7 +23,6 @@ tmpdir=$1 # rm -rf -- "${tmpdir}" mkdir -p -- "${tmpdir}/boot" -dirs=boot # @@ -42,7 +41,6 @@ fi # if grep -q '^CONFIG_MODULES=y' include/config/auto.conf; then make ARCH="${ARCH}" -f ${srctree}/Makefile INSTALL_MOD_PATH="${tmpdir}" modules_install - dirs="$dirs lib" fi -- GitLab From b28d6ca1c9cbb64b0c8e435c0ff34d8c5d52812c Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Tue, 21 Nov 2023 12:58:55 +0100 Subject: [PATCH 0155/1290] kbuild: buildtar: always make modules_install It is done for the same reasons as 4243afdb9326 does it for builddeb: always runs make modules to install modules.builtin* files, which are needed for e.g. initramfs-tools or LTP testing tool. Signed-off-by: Petr Vorel Signed-off-by: Masahiro Yamada --- scripts/package/buildtar | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/package/buildtar b/scripts/package/buildtar index 8ac075dd0e9c5..72c91a1b832f9 100755 --- a/scripts/package/buildtar +++ b/scripts/package/buildtar @@ -37,11 +37,9 @@ fi # -# Try to install modules +# Install modules # -if grep -q '^CONFIG_MODULES=y' include/config/auto.conf; then - make ARCH="${ARCH}" -f ${srctree}/Makefile INSTALL_MOD_PATH="${tmpdir}" modules_install -fi +make ARCH="${ARCH}" -f ${srctree}/Makefile INSTALL_MOD_PATH="${tmpdir}" modules_install # -- GitLab From ef6609adf1ecc4c0797a894d4dd365dbbc4903f9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 23 Nov 2023 16:18:24 +0900 Subject: [PATCH 0156/1290] kbuild: remove the last use of old cmd_src_tar rule in packaging The rpm-pkg and deb-pkg targets have transitioned to using 'git archive' for tarball creation. Although the old cmd_src_tar is still used by snap-pkg, there is no need to pack and unpack a tarball solely for passing the source to snapcraft. Instead, you can use 'source-type: local' to tell the source location to snapcraft. Signed-off-by: Masahiro Yamada --- Makefile | 2 -- scripts/Makefile.package | 24 +----------------------- scripts/package/snapcraft.template | 2 +- 3 files changed, 2 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 99db546fbb452..0df7372175294 100644 --- a/Makefile +++ b/Makefile @@ -609,8 +609,6 @@ export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL KBUILD_RUSTFLAGS_KERNEL export RCS_FIND_IGNORE := \( -name SCCS -o -name BitKeeper -o -name .svn -o \ -name CVS -o -name .pc -o -name .hg -o -name .git \) \ -prune -o -export RCS_TAR_IGNORE := --exclude SCCS --exclude BitKeeper --exclude .svn \ - --exclude CVS --exclude .pc --exclude .hg --exclude .git # =========================================================================== # Rules shared between *config targets and build targets diff --git a/scripts/Makefile.package b/scripts/Makefile.package index f30349f46a975..0c3adc48dfe89 100644 --- a/scripts/Makefile.package +++ b/scripts/Makefile.package @@ -4,27 +4,6 @@ include $(srctree)/scripts/Kbuild.include include $(srctree)/scripts/Makefile.lib -KERNELPATH := kernel-$(subst -,_,$(KERNELRELEASE)) -# Include only those top-level files that are needed by make, plus the GPL copy -TAR_CONTENT := Documentation LICENSES arch block certs crypto drivers fs \ - include init io_uring ipc kernel lib mm net rust \ - samples scripts security sound tools usr virt \ - .config Makefile \ - Kbuild Kconfig COPYING $(wildcard localversion*) - -quiet_cmd_src_tar = TAR $(2).tar.gz - cmd_src_tar = \ -if test "$(objtree)" != "$(srctree)"; then \ - echo >&2; \ - echo >&2 " ERROR:"; \ - echo >&2 " Building source tarball is not possible outside the"; \ - echo >&2 " kernel source tree. Don't set KBUILD_OUTPUT"; \ - echo >&2; \ - false; \ -fi ; \ -tar -I $(KGZIP) -c $(RCS_TAR_IGNORE) -f $(2).tar.gz \ - --transform 's:^:$(2)/:S' $(TAR_CONTENT) $(3) - # Git # --------------------------------------------------------------------------- @@ -157,9 +136,8 @@ snap-pkg: rm -rf $(objtree)/snap mkdir $(objtree)/snap $(MAKE) clean - $(call cmd,src_tar,$(KERNELPATH)) sed "s@KERNELRELEASE@$(KERNELRELEASE)@; \ - s@SRCTREE@$(shell realpath $(KERNELPATH).tar.gz)@" \ + s@SRCTREE@$(abs_srctree)@" \ $(srctree)/scripts/package/snapcraft.template > \ $(objtree)/snap/snapcraft.yaml cd $(objtree)/snap && \ diff --git a/scripts/package/snapcraft.template b/scripts/package/snapcraft.template index 626d278e4a5a7..85d5e07d1b40b 100644 --- a/scripts/package/snapcraft.template +++ b/scripts/package/snapcraft.template @@ -10,5 +10,5 @@ parts: kernel: plugin: kernel source: SRCTREE - source-type: tar + source-type: local kernel-with-firmware: false -- GitLab From 55d50ace6b88eb273a10963160cadbadccfcdd64 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Mon, 27 Nov 2023 20:44:05 +0800 Subject: [PATCH 0157/1290] soundwire: generic_bandwidth_allocation use bus->params.max_dr_freq bus->params.max_dr_freq is calculated and set in sdw_bus_master_add(). We can use it directly instead of calculating it again. Signed-off-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Reviewed-by: Rander Wang Link: https://lore.kernel.org/r/20231127124405.2080431-1-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/generic_bandwidth_allocation.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/soundwire/generic_bandwidth_allocation.c b/drivers/soundwire/generic_bandwidth_allocation.c index 31162f2b56381..c70a63d009ae4 100644 --- a/drivers/soundwire/generic_bandwidth_allocation.c +++ b/drivers/soundwire/generic_bandwidth_allocation.c @@ -333,7 +333,7 @@ static int sdw_select_row_col(struct sdw_bus *bus, int clk_freq) */ static int sdw_compute_bus_params(struct sdw_bus *bus) { - unsigned int max_dr_freq, curr_dr_freq = 0; + unsigned int curr_dr_freq = 0; struct sdw_master_prop *mstr_prop = &bus->prop; int i, clk_values, ret; bool is_gear = false; @@ -351,14 +351,12 @@ static int sdw_compute_bus_params(struct sdw_bus *bus) clk_buf = NULL; } - max_dr_freq = mstr_prop->max_clk_freq * SDW_DOUBLE_RATE_FACTOR; - for (i = 0; i < clk_values; i++) { if (!clk_buf) - curr_dr_freq = max_dr_freq; + curr_dr_freq = bus->params.max_dr_freq; else curr_dr_freq = (is_gear) ? - (max_dr_freq >> clk_buf[i]) : + (bus->params.max_dr_freq >> clk_buf[i]) : clk_buf[i] * SDW_DOUBLE_RATE_FACTOR; if (curr_dr_freq <= bus->params.bandwidth) -- GitLab From 72108c0b9c0e004d7dfc2fc88b02c30b12711325 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Tue, 31 Oct 2023 10:55:23 +0000 Subject: [PATCH 0158/1290] perf tools: Add --debug-file option to redirect debug output Currently, debug messages is output to stderr, add --debug-file option to support redirection to a specified file. Some test scenarios: # perf --list-opts --help --version --exec-path --html-path --paginate --no-pager --debugfs-dir --buildid-dir --list-cmds --list-opts --debug --debug-file # perf --debug-file No path given for --debug-file. Usage: perf [--version] [--help] [OPTIONS] COMMAND [ARGS] # perf --debug-file /sys/perf.log record -v true Open debug file '/sys/perf.log' failed: Permission denied Usage: perf [--version] [--help] [OPTIONS] COMMAND [ARGS] # perf --debug-file /tmp/perf.log record -v true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.013 MB perf.data (26 samples) ] # cat /tmp/perf.log DEBUGINFOD_URLS= Using CPUID GenuineIntel-6-3E-4 nr_cblocks: 0 affinity: SYS mmap flush: 1 comp level: 0 mmap size 528384B Control descriptor is not initialized mmap size 528384B Looking at the vmlinux_path (8 entries long) Using /proc/kcore for kernel data Using /proc/kallsyms for symbols symbol:unmap_start file:(null) line:0 offset:0 return:0 lazy:(null) symbol:unmap_complete file:(null) line:0 offset:0 return:0 lazy:(null) symbol:map_start file:(null) line:0 offset:0 return:0 lazy:(null) symbol:map_complete file:(null) line:0 offset:0 return:0 lazy:(null) symbol:reloc_start file:(null) line:0 offset:0 return:0 lazy:(null) symbol:reloc_complete file:(null) line:0 offset:0 return:0 lazy:(null) symbol:init_start file:(null) line:0 offset:0 return:0 lazy:(null) symbol:init_complete file:(null) line:0 offset:0 return:0 lazy:(null) symbol:lll_lock_wait_private file:(null) line:0 offset:0 return:0 lazy:(null) symbol:lll_lock_wait file:(null) line:0 offset:0 return:0 lazy:(null) symbol:setjmp file:(null) line:0 offset:0 return:0 lazy:(null) symbol:longjmp file:(null) line:0 offset:0 return:0 lazy:(null) symbol:longjmp_target file:(null) line:0 offset:0 return:0 lazy:(null) failed to write feature HYBRID_TOPOLOGY Signed-off-by: Yang Jihong Acked-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231031105523.1472558-1-yangjihong1@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf.txt | 3 +++ tools/perf/perf.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt index ba3df49c169d3..a7cf7bc2f9689 100644 --- a/tools/perf/Documentation/perf.txt +++ b/tools/perf/Documentation/perf.txt @@ -64,6 +64,9 @@ OPTIONS perf-event-open - Print perf_event_open() arguments and return value +--debug-file:: + Write debug output to a specified file. + DESCRIPTION ----------- Performance counters for Linux are a new kernel-based subsystem diff --git a/tools/perf/perf.c b/tools/perf/perf.c index d3fc8090413c8..921bee0a64370 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -39,6 +39,7 @@ #include static int use_pager = -1; +static FILE *debug_fp = NULL; struct cmd_struct { const char *cmd; @@ -162,6 +163,19 @@ static void commit_pager_choice(void) } } +static int set_debug_file(const char *path) +{ + debug_fp = fopen(path, "w"); + if (!debug_fp) { + fprintf(stderr, "Open debug file '%s' failed: %s\n", + path, strerror(errno)); + return -1; + } + + debug_set_file(debug_fp); + return 0; +} + struct option options[] = { OPT_ARGUMENT("help", "help"), OPT_ARGUMENT("version", "version"), @@ -174,6 +188,7 @@ struct option options[] = { OPT_ARGUMENT("list-cmds", "list-cmds"), OPT_ARGUMENT("list-opts", "list-opts"), OPT_ARGUMENT("debug", "debug"), + OPT_ARGUMENT("debug-file", "debug-file"), OPT_END() }; @@ -287,6 +302,18 @@ static int handle_options(const char ***argv, int *argc, int *envchanged) (*argv)++; (*argc)--; + } else if (!strcmp(cmd, "--debug-file")) { + if (*argc < 2) { + fprintf(stderr, "No path given for --debug-file.\n"); + usage(perf_usage_string); + } + + if (set_debug_file((*argv)[1])) + usage(perf_usage_string); + + (*argv)++; + (*argc)--; + } else { fprintf(stderr, "Unknown option: %s\n", cmd); usage(perf_usage_string); @@ -547,5 +574,8 @@ int main(int argc, const char **argv) fprintf(stderr, "Failed to run command '%s': %s\n", cmd, str_error_r(errno, sbuf, sizeof(sbuf))); out: + if (debug_fp) + fclose(debug_fp); + return 1; } -- GitLab From d60469d7c0e5c4e8de10699377d2ba79004236a4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 9 Nov 2023 15:59:51 -0800 Subject: [PATCH 0159/1290] perf dwarf-aux: Add die_find_variable_by_addr() The die_find_variable_by_addr() is to find a variables in the given DIE using given (PC-relative) address. Global variables will have a location expression with DW_OP_addr which has an address so can simply compare it with the address. <1><143a7>: Abbrev Number: 2 (DW_TAG_variable) <143a8> DW_AT_name : loops_per_jiffy <143ac> DW_AT_type : <0x1cca> <143b0> DW_AT_external : 1 <143b0> DW_AT_decl_file : 193 <143b1> DW_AT_decl_line : 213 <143b2> DW_AT_location : 9 byte block: 3 b0 46 41 82 ff ff ff ff (DW_OP_addr: ffffffff824146b0) Note that the type-offset should be calculated from the base address of the global variable. Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu (Google) Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231110000012.3538610-33-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 79 +++++++++++++++++++++++++++++++++++++ tools/perf/util/dwarf-aux.h | 14 +++++++ 2 files changed, 93 insertions(+) diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 652e6e7368a26..edd9e407bc74e 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1250,8 +1250,12 @@ out: struct find_var_data { /* Target instruction address */ Dwarf_Addr pc; + /* Target memory address (for global data) */ + Dwarf_Addr addr; /* Target register */ unsigned reg; + /* Access offset, set for global data */ + int offset; }; /* Max number of registers DW_OP_regN supports */ @@ -1312,6 +1316,81 @@ Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die, Dwarf_Addr pc, int reg, }; return die_find_child(sc_die, __die_find_var_reg_cb, &data, die_mem); } + +/* Only checks direct child DIEs in the given scope */ +static int __die_find_var_addr_cb(Dwarf_Die *die_mem, void *arg) +{ + struct find_var_data *data = arg; + int tag = dwarf_tag(die_mem); + ptrdiff_t off = 0; + Dwarf_Attribute attr; + Dwarf_Addr base, start, end; + Dwarf_Word size; + Dwarf_Die type_die; + Dwarf_Op *ops; + size_t nops; + + if (tag != DW_TAG_variable) + return DIE_FIND_CB_SIBLING; + + if (dwarf_attr(die_mem, DW_AT_location, &attr) == NULL) + return DIE_FIND_CB_SIBLING; + + while ((off = dwarf_getlocations(&attr, off, &base, &start, &end, &ops, &nops)) > 0) { + if (ops->atom != DW_OP_addr) + continue; + + if (data->addr < ops->number) + continue; + + if (data->addr == ops->number) { + /* Update offset relative to the start of the variable */ + data->offset = 0; + return DIE_FIND_CB_END; + } + + if (die_get_real_type(die_mem, &type_die) == NULL) + continue; + + if (dwarf_aggregate_size(&type_die, &size) < 0) + continue; + + if (data->addr >= ops->number + size) + continue; + + /* Update offset relative to the start of the variable */ + data->offset = data->addr - ops->number; + return DIE_FIND_CB_END; + } + return DIE_FIND_CB_SIBLING; +} + +/** + * die_find_variable_by_addr - Find variable located at given address + * @sc_die: a scope DIE + * @pc: the program address to find + * @addr: the data address to find + * @die_mem: a buffer to save the resulting DIE + * @offset: the offset in the resulting type + * + * Find the variable DIE located at the given address (in PC-relative mode). + * This is usually for global variables. + */ +Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr pc, + Dwarf_Addr addr, Dwarf_Die *die_mem, + int *offset) +{ + struct find_var_data data = { + .pc = pc, + .addr = addr, + }; + Dwarf_Die *result; + + result = die_find_child(sc_die, __die_find_var_addr_cb, &data, die_mem); + if (result) + *offset = data.offset; + return result; +} #endif /* diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index b6f430730bd13..0ddf61fd3f8be 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -141,6 +141,11 @@ int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf); Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die, Dwarf_Addr pc, int reg, Dwarf_Die *die_mem); +/* Find a (global) variable located in the 'addr' */ +Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr pc, + Dwarf_Addr addr, Dwarf_Die *die_mem, + int *offset); + #else /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ static inline int die_get_var_range(Dwarf_Die *sp_die __maybe_unused, @@ -158,6 +163,15 @@ static inline Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die __maybe_unus return NULL; } +static inline Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die __maybe_unused, + Dwarf_Addr pc __maybe_unused, + Dwarf_Addr addr __maybe_unused, + Dwarf_Die *die_mem __maybe_unused, + int *offset __maybe_unused) +{ + return NULL; +} + #endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */ #endif /* _DWARF_AUX_H */ -- GitLab From 5940a20a186bd74efd6d0dc0b2b7c77d891895d9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 2 Nov 2023 10:56:46 -0700 Subject: [PATCH 0160/1290] perf mmap: Lazily initialize zstd streams to save memory when not using it Zstd streams create dictionaries that can require significant RAM, especially when there is one per-CPU. Tools like 'perf record' won't use the streams without the -z option, and so the creation of the streams is pure overhead. Switch to creating the streams on first use. Committer notes: ssize_t comes from sys/types.h, size_t from stddef.h. This worked on glibc as stdlib.h includes both, but not on musl libc. So do what 'man size_t' says and include sys/types.h and stddef.h instead of stdlib.h Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Jajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231102175735.2272696-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 26 ++++++++++----- tools/perf/util/compress.h | 7 +++-- tools/perf/util/mmap.c | 5 ++- tools/perf/util/mmap.h | 1 - tools/perf/util/zstd.c | 63 +++++++++++++++++++------------------ 5 files changed, 59 insertions(+), 43 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 8ec818568662e..9b4f3805ca92f 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -270,7 +270,7 @@ static int record__write(struct record *rec, struct mmap *map __maybe_unused, static int record__aio_enabled(struct record *rec); static int record__comp_enabled(struct record *rec); -static size_t zstd_compress(struct perf_session *session, struct mmap *map, +static ssize_t zstd_compress(struct perf_session *session, struct mmap *map, void *dst, size_t dst_size, void *src, size_t src_size); #ifdef HAVE_AIO_SUPPORT @@ -405,9 +405,13 @@ static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size */ if (record__comp_enabled(aio->rec)) { - size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size, - mmap__mmap_len(map) - aio->size, - buf, size); + ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size, + mmap__mmap_len(map) - aio->size, + buf, size); + if (compressed < 0) + return (int)compressed; + + size = compressed; } else { memcpy(aio->data + aio->size, buf, size); } @@ -633,7 +637,13 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size) struct record *rec = to; if (record__comp_enabled(rec)) { - size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size); + ssize_t compressed = zstd_compress(rec->session, map, map->data, + mmap__mmap_len(map), bf, size); + + if (compressed < 0) + return (int)compressed; + + size = compressed; bf = map->data; } @@ -1527,10 +1537,10 @@ static size_t process_comp_header(void *record, size_t increment) return size; } -static size_t zstd_compress(struct perf_session *session, struct mmap *map, +static ssize_t zstd_compress(struct perf_session *session, struct mmap *map, void *dst, size_t dst_size, void *src, size_t src_size) { - size_t compressed; + ssize_t compressed; size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1; struct zstd_data *zstd_data = &session->zstd_data; @@ -1539,6 +1549,8 @@ static size_t zstd_compress(struct perf_session *session, struct mmap *map, compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size, max_record_size, process_comp_header); + if (compressed < 0) + return compressed; if (map && map->file) { thread->bytes_transferred += src_size; diff --git a/tools/perf/util/compress.h b/tools/perf/util/compress.h index 0cd3369af2a4f..b29109cd36095 100644 --- a/tools/perf/util/compress.h +++ b/tools/perf/util/compress.h @@ -3,6 +3,8 @@ #define PERF_COMPRESS_H #include +#include +#include #ifdef HAVE_ZSTD_SUPPORT #include #endif @@ -21,6 +23,7 @@ struct zstd_data { #ifdef HAVE_ZSTD_SUPPORT ZSTD_CStream *cstream; ZSTD_DStream *dstream; + int comp_level; #endif }; @@ -29,7 +32,7 @@ struct zstd_data { int zstd_init(struct zstd_data *data, int level); int zstd_fini(struct zstd_data *data); -size_t zstd_compress_stream_to_records(struct zstd_data *data, void *dst, size_t dst_size, +ssize_t zstd_compress_stream_to_records(struct zstd_data *data, void *dst, size_t dst_size, void *src, size_t src_size, size_t max_record_size, size_t process_header(void *record, size_t increment)); @@ -48,7 +51,7 @@ static inline int zstd_fini(struct zstd_data *data __maybe_unused) } static inline -size_t zstd_compress_stream_to_records(struct zstd_data *data __maybe_unused, +ssize_t zstd_compress_stream_to_records(struct zstd_data *data __maybe_unused, void *dst __maybe_unused, size_t dst_size __maybe_unused, void *src __maybe_unused, size_t src_size __maybe_unused, size_t max_record_size __maybe_unused, diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 49093b21ee2da..122ee198a86e9 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -295,15 +295,14 @@ int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, struct perf_cpu map->core.flush = mp->flush; - map->comp_level = mp->comp_level; #ifndef PYTHON_PERF - if (zstd_init(&map->zstd_data, map->comp_level)) { + if (zstd_init(&map->zstd_data, mp->comp_level)) { pr_debug2("failed to init mmap compressor, error %d\n", errno); return -1; } #endif - if (map->comp_level && !perf_mmap__aio_enabled(map)) { + if (mp->comp_level && !perf_mmap__aio_enabled(map)) { map->data = mmap(NULL, mmap__mmap_len(map), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (map->data == MAP_FAILED) { diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index f944c3cd5efa0..0df6e1621c7e8 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -39,7 +39,6 @@ struct mmap { #endif struct mmap_cpu_mask affinity_mask; void *data; - int comp_level; struct perf_data_file *file; struct zstd_data zstd_data; }; diff --git a/tools/perf/util/zstd.c b/tools/perf/util/zstd.c index 48dd2b018c47a..57027e0ac7b65 100644 --- a/tools/perf/util/zstd.c +++ b/tools/perf/util/zstd.c @@ -7,35 +7,9 @@ int zstd_init(struct zstd_data *data, int level) { - size_t ret; - - data->dstream = ZSTD_createDStream(); - if (data->dstream == NULL) { - pr_err("Couldn't create decompression stream.\n"); - return -1; - } - - ret = ZSTD_initDStream(data->dstream); - if (ZSTD_isError(ret)) { - pr_err("Failed to initialize decompression stream: %s\n", ZSTD_getErrorName(ret)); - return -1; - } - - if (!level) - return 0; - - data->cstream = ZSTD_createCStream(); - if (data->cstream == NULL) { - pr_err("Couldn't create compression stream.\n"); - return -1; - } - - ret = ZSTD_initCStream(data->cstream, level); - if (ZSTD_isError(ret)) { - pr_err("Failed to initialize compression stream: %s\n", ZSTD_getErrorName(ret)); - return -1; - } - + data->comp_level = level; + data->dstream = NULL; + data->cstream = NULL; return 0; } @@ -54,7 +28,7 @@ int zstd_fini(struct zstd_data *data) return 0; } -size_t zstd_compress_stream_to_records(struct zstd_data *data, void *dst, size_t dst_size, +ssize_t zstd_compress_stream_to_records(struct zstd_data *data, void *dst, size_t dst_size, void *src, size_t src_size, size_t max_record_size, size_t process_header(void *record, size_t increment)) { @@ -63,6 +37,21 @@ size_t zstd_compress_stream_to_records(struct zstd_data *data, void *dst, size_t ZSTD_outBuffer output; void *record; + if (!data->cstream) { + data->cstream = ZSTD_createCStream(); + if (data->cstream == NULL) { + pr_err("Couldn't create compression stream.\n"); + return -1; + } + + ret = ZSTD_initCStream(data->cstream, data->comp_level); + if (ZSTD_isError(ret)) { + pr_err("Failed to initialize compression stream: %s\n", + ZSTD_getErrorName(ret)); + return -1; + } + } + while (input.pos < input.size) { record = dst; size = process_header(record, 0); @@ -96,6 +85,20 @@ size_t zstd_decompress_stream(struct zstd_data *data, void *src, size_t src_size ZSTD_inBuffer input = { src, src_size, 0 }; ZSTD_outBuffer output = { dst, dst_size, 0 }; + if (!data->dstream) { + data->dstream = ZSTD_createDStream(); + if (data->dstream == NULL) { + pr_err("Couldn't create decompression stream.\n"); + return 0; + } + + ret = ZSTD_initDStream(data->dstream); + if (ZSTD_isError(ret)) { + pr_err("Failed to initialize decompression stream: %s\n", + ZSTD_getErrorName(ret)); + return 0; + } + } while (input.pos < input.size) { ret = ZSTD_decompressStream(data->dstream, &output, &input); if (ZSTD_isError(ret)) { -- GitLab From a472ee42e6f60c8714e2306385687922afcba8c4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 29 Nov 2023 12:47:17 -0300 Subject: [PATCH 0161/1290] perf test sigtrap: Generalize the BTF routine to reuse it in this test Move the part that loads the BTF info to a "btf__available()" that will lazy load the BTF info so that if we need it for some other test, which we will in the following cset, we can reuse it. At some point this will move from this specific 'perf test' entry to be used in other parts of perf, do it when needed. Cc: Adrian Hunter Cc: Clark Williams Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kate Carcia Cc: Namhyung Kim Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20231129154718.326330-2-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/sigtrap.c | 60 +++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/tools/perf/tests/sigtrap.c b/tools/perf/tests/sigtrap.c index 1de7478ec1894..a1bc7c776254e 100644 --- a/tools/perf/tests/sigtrap.c +++ b/tools/perf/tests/sigtrap.c @@ -57,36 +57,51 @@ static struct perf_event_attr make_event_attr(void) #ifdef HAVE_BPF_SKEL #include -static bool attr_has_sigtrap(void) +static struct btf *btf; + +static bool btf__available(void) { - bool ret = false; - struct btf *btf; - const struct btf_type *t; + if (btf == NULL) + btf = btf__load_vmlinux_btf(); + + return btf != NULL; +} + +static void btf__exit(void) +{ + btf__free(btf); + btf = NULL; +} + +static const struct btf_member *__btf_type__find_member_by_name(int type_id, const char *member_name) +{ + const struct btf_type *t = btf__type_by_id(btf, type_id); const struct btf_member *m; - const char *name; - int i, id; + int i; + + for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) { + const char *current_member_name = btf__name_by_offset(btf, m->name_off); + if (!strcmp(current_member_name, member_name)) + return m; + } + + return NULL; +} + +static bool attr_has_sigtrap(void) +{ + int id; - btf = btf__load_vmlinux_btf(); - if (btf == NULL) { + if (!btf__available()) { /* should be an old kernel */ return false; } id = btf__find_by_name_kind(btf, "perf_event_attr", BTF_KIND_STRUCT); if (id < 0) - goto out; + return false; - t = btf__type_by_id(btf, id); - for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) { - name = btf__name_by_offset(btf, m->name_off); - if (!strcmp(name, "sigtrap")) { - ret = true; - break; - } - } -out: - btf__free(btf); - return ret; + return __btf_type__find_member_by_name(id, "sigtrap") != NULL; } #else /* !HAVE_BPF_SKEL */ static bool attr_has_sigtrap(void) @@ -109,6 +124,10 @@ static bool attr_has_sigtrap(void) return ret; } + +static void btf__exit(void) +{ +} #endif /* HAVE_BPF_SKEL */ static void @@ -221,6 +240,7 @@ out_restore_sigaction: sigaction(SIGTRAP, &oldact, NULL); out: pthread_barrier_destroy(&barrier); + btf__exit(); return ret; } -- GitLab From 650e0bde43f35bb675e87e30f679a57cfa22e0e5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 29 Nov 2023 12:47:18 -0300 Subject: [PATCH 0162/1290] perf tests sigtrap: Skip if running on a kernel with sleepable spinlocks There are issues as reported that need some more investigation on the RT kernel front, till that is addressed, skip this test. This test is already skipped for multiple hardware architectures where the tested kernel feature is not supported. Acked-by: Marco Elver Cc: Adrian Hunter Cc: Clark Williams Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Juri Lelli Cc: Kate Carcia Cc: Marco Elver Cc: Mike Galbraith Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/all/e368f2c848d77fbc8d259f44e2055fe469c219cf.camel@gmx.de/ Link: https://lore.kernel.org/r/20231129154718.326330-3-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/sigtrap.c | 46 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/sigtrap.c b/tools/perf/tests/sigtrap.c index a1bc7c776254e..e6fd934b027a3 100644 --- a/tools/perf/tests/sigtrap.c +++ b/tools/perf/tests/sigtrap.c @@ -103,6 +103,34 @@ static bool attr_has_sigtrap(void) return __btf_type__find_member_by_name(id, "sigtrap") != NULL; } + +static bool kernel_with_sleepable_spinlocks(void) +{ + const struct btf_member *member; + const struct btf_type *type; + const char *type_name; + int id; + + if (!btf__available()) + return false; + + id = btf__find_by_name_kind(btf, "spinlock", BTF_KIND_STRUCT); + if (id < 0) + return false; + + // Only RT has a "lock" member for "struct spinlock" + member = __btf_type__find_member_by_name(id, "lock"); + if (member == NULL) + return false; + + // But check its type as well + type = btf__type_by_id(btf, member->type); + if (!type || !btf_is_struct(type)) + return false; + + type_name = btf__name_by_offset(btf, type->name_off); + return type_name && !strcmp(type_name, "rt_mutex_base"); +} #else /* !HAVE_BPF_SKEL */ static bool attr_has_sigtrap(void) { @@ -125,6 +153,11 @@ static bool attr_has_sigtrap(void) return ret; } +static bool kernel_with_sleepable_spinlocks(void) +{ + return false; +} + static void btf__exit(void) { } @@ -166,7 +199,7 @@ static int run_test_threads(pthread_t *threads, pthread_barrier_t *barrier) static int run_stress_test(int fd, pthread_t *threads, pthread_barrier_t *barrier) { - int ret; + int ret, expected_sigtraps; ctx.iterate_on = 3000; @@ -175,7 +208,16 @@ static int run_stress_test(int fd, pthread_t *threads, pthread_barrier_t *barrie ret = run_test_threads(threads, barrier); TEST_ASSERT_EQUAL("disable failed", ioctl(fd, PERF_EVENT_IOC_DISABLE, 0), 0); - TEST_ASSERT_EQUAL("unexpected sigtraps", ctx.signal_count, NUM_THREADS * ctx.iterate_on); + expected_sigtraps = NUM_THREADS * ctx.iterate_on; + + if (ctx.signal_count < expected_sigtraps && kernel_with_sleepable_spinlocks()) { + pr_debug("Expected %d sigtraps, got %d, running on a kernel with sleepable spinlocks.\n", + expected_sigtraps, ctx.signal_count); + pr_debug("See https://lore.kernel.org/all/e368f2c848d77fbc8d259f44e2055fe469c219cf.camel@gmx.de/\n"); + return TEST_SKIP; + } else + TEST_ASSERT_EQUAL("unexpected sigtraps", ctx.signal_count, expected_sigtraps); + TEST_ASSERT_EQUAL("missing signals or incorrectly delivered", ctx.tids_want_signal, 0); TEST_ASSERT_VAL("unexpected si_addr", ctx.first_siginfo.si_addr == &ctx.iterate_on); #if 0 /* FIXME: enable when libc's signal.h has si_perf_{type,data} */ -- GitLab From 72a2a0a494ec9aefbca4ad64f46b8e3370809993 Mon Sep 17 00:00:00 2001 From: Likhitha Korrapati Date: Sun, 26 Nov 2023 02:09:14 -0500 Subject: [PATCH 0163/1290] perf test record+probe_libc_inet_pton: Fix call chain match on powerpc The perf test "probe libc's inet_pton & backtrace it with ping" fails on powerpc as below: # perf test -v "probe libc's inet_pton & backtrace it with ping" 85: probe libc's inet_pton & backtrace it with ping : --- start --- test child forked, pid 96028 ping 96056 [002] 127271.101961: probe_libc:inet_pton: (7fffa1779a60) 7fffa1779a60 __GI___inet_pton+0x0 (/usr/lib64/glibc-hwcaps/power10/libc.so.6) 7fffa172a73c getaddrinfo+0x121c (/usr/lib64/glibc-hwcaps/power10/libc.so.6) FAIL: expected backtrace entry "gaih_inet.*\+0x[[:xdigit:]]+[[:space:]]\(/usr/lib64/glibc-hwcaps/power10/libc.so.6\)$" got "7fffa172a73c getaddrinfo+0x121c (/usr/lib64/glibc-hwcaps/power10/libc.so.6)" test child finished with -1 ---- end ---- probe libc's inet_pton & backtrace it with ping: FAILED! This test installs a probe on libc's inet_pton function, which will use uprobes and then uses perf trace on a ping to localhost. It gets 3 levels deep backtrace and checks whether it is what we expected or not. The test started failing from RHEL 9.4 where as it works in previous distro version (RHEL 9.2). Test expects gaih_inet function to be part of backtrace. But in the glibc version (2.34-86) which is part of distro where it fails, this function is missing and hence the test is failing. From nm and ping command output we can confirm that gaih_inet function is not present in the expected backtrace for glibc version glibc-2.34-86 [root@xxx perf]# nm /usr/lib64/glibc-hwcaps/power10/libc.so.6 | grep gaih_inet 00000000001273e0 t gaih_inet_serv 00000000001cd8d8 r gaih_inet_typeproto [root@xxx perf]# perf script -i /tmp/perf.data.6E8 ping 104048 [000] 128582.508976: probe_libc:inet_pton: (7fff83779a60) 7fff83779a60 __GI___inet_pton+0x0 (/usr/lib64/glibc-hwcaps/power10/libc.so.6) 7fff8372a73c getaddrinfo+0x121c (/usr/lib64/glibc-hwcaps/power10/libc.so.6) 11dc73534 [unknown] (/usr/bin/ping) 7fff8362a8c4 __libc_start_call_main+0x84 (/usr/lib64/glibc-hwcaps/power10/libc.so.6) FAIL: expected backtrace entry "gaih_inet.*\+0x[[:xdigit:]]+[[:space:]]\(/usr/lib64/glibc-hwcaps/power10/libc.so.6\)$" got "7fff9d52a73c getaddrinfo+0x121c (/usr/lib64/glibc-hwcaps/power10/libc.so.6)" With version glibc-2.34-60 gaih_inet function is present as part of the expected backtrace. So we cannot just remove the gaih_inet function from the backtrace. [root@xxx perf]# nm /usr/lib64/glibc-hwcaps/power10/libc.so.6 | grep gaih_inet 0000000000130490 t gaih_inet.constprop.0 000000000012e830 t gaih_inet_serv 00000000001d45e4 r gaih_inet_typeproto [root@xxx perf]# ./perf script -i /tmp/perf.data.b6S ping 67906 [000] 22699.591699: probe_libc:inet_pton_3: (7fffbdd80820) 7fffbdd80820 __GI___inet_pton+0x0 (/usr/lib64/glibc-hwcaps/power10/libc.so.6) 7fffbdd31160 gaih_inet.constprop.0+0xcd0 (/usr/lib64/glibc-hwcaps/power10/libc.so.6) 7fffbdd31c7c getaddrinfo+0x14c (/usr/lib64/glibc-hwcaps/power10/libc.so.6) 1140d3558 [unknown] (/usr/bin/ping) This patch solves this issue by doing a conditional skip. If there is a gaih_inet function present in the libc then it will be added to the expected backtrace else the function will be skipped from being added to the expected backtrace. Output with the patch [root@xxx perf]# ./perf test -v "probe libc's inet_pton & backtrace it with ping" 83: probe libc's inet_pton & backtrace it with ping : --- start --- test child forked, pid 102662 ping 102692 [000] 127935.549973: probe_libc:inet_pton: (7fff93379a60) 7fff93379a60 __GI___inet_pton+0x0 (/usr/lib64/glibc-hwcaps/power10/libc.so.6) 7fff9332a73c getaddrinfo+0x121c (/usr/lib64/glibc-hwcaps/power10/libc.so.6) 11ef03534 [unknown] (/usr/bin/ping) test child finished with 0 ---- end ---- probe libc's inet_pton & backtrace it with ping: Ok Reported-by: Disha Goel Reviewed-by: Athira Jajeev Reviewed-by: Ian Rogers Signed-off-by: Likhitha Korrapati Tested-by: Disha Goel Cc: Adrian Hunter Cc: Disha Goel Cc: James Clark Cc: Jiri Olsa Cc: Kajol Jain Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: linuxppc-dev@lists.ozlabs.org Link: https://lore.kernel.org/r/20231126070914.175332-1-likhitha@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/record+probe_libc_inet_pton.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh index eebeea6bdc767..72c65570db378 100755 --- a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh +++ b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh @@ -45,7 +45,10 @@ trace_libc_inet_pton_backtrace() { ;; ppc64|ppc64le) eventattr='max-stack=4' - echo "gaih_inet.*\+0x[[:xdigit:]]+[[:space:]]\($libc\)$" >> $expected + # Add gaih_inet to expected backtrace only if it is part of libc. + if nm $libc | grep -F -q gaih_inet.; then + echo "gaih_inet.*\+0x[[:xdigit:]]+[[:space:]]\($libc\)$" >> $expected + fi echo "getaddrinfo\+0x[[:xdigit:]]+[[:space:]]\($libc\)$" >> $expected echo ".*(\+0x[[:xdigit:]]+|\[unknown\])[[:space:]]\(.*/bin/ping.*\)$" >> $expected ;; -- GitLab From 11baacb2fd9b1c6e759ad54ca13f259fa1769c73 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 28 Nov 2023 15:48:09 -0600 Subject: [PATCH 0164/1290] dt-bindings: input: sprd,sc27xx-vibrator: Drop incomplete example The example for the Spreadtrum SC27xx PMIC vibrator is incomplete as the binding is the full PMIC, not just the sub-functions. It is preferred for MFD examples to be complete in the top-level MFD device binding rather than piecemeal in each sub-function binding. This also fixes an undocumented (by schema) compatible warning for "sprd,sc2731". Signed-off-by: Rob Herring Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231128214809.3975719-1-robh@kernel.org Signed-off-by: Dmitry Torokhov --- .../bindings/input/sprd,sc27xx-vibrator.yaml | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/Documentation/devicetree/bindings/input/sprd,sc27xx-vibrator.yaml b/Documentation/devicetree/bindings/input/sprd,sc27xx-vibrator.yaml index a401a0bfcbec2..4c8d303ff93c9 100644 --- a/Documentation/devicetree/bindings/input/sprd,sc27xx-vibrator.yaml +++ b/Documentation/devicetree/bindings/input/sprd,sc27xx-vibrator.yaml @@ -28,21 +28,4 @@ required: additionalProperties: false -examples: - - | - #include - sc2731_pmic: pmic@0 { - compatible = "sprd,sc2731"; - reg = <0 0>; - spi-max-frequency = <26000000>; - interrupts = ; - interrupt-controller; - #interrupt-cells = <2>; - #address-cells = <1>; - #size-cells = <0>; - - vibrator@eb4 { - compatible = "sprd,sc2731-vibrator"; - reg = <0xeb4>; - }; - }; +... -- GitLab From a42b4bd51b2aec77fb38f1ce0f41846055a8d33c Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 28 Nov 2023 15:48:16 -0600 Subject: [PATCH 0165/1290] dt-bindings: input: mediatek,pmic-keys: Drop incomplete example The example for the Mediatek PMIC keys is incomplete as the binding is the full PMIC, not just the sub-functions. It is preferred for MFD examples to be complete in the top-level MFD device binding rather than piecemeal in each sub-function binding. This also fixes an undocumented (by schema) compatible warning for "mediatek,mt6397". Signed-off-by: Rob Herring Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231128214816.3975893-1-robh@kernel.org Signed-off-by: Dmitry Torokhov --- .../bindings/input/mediatek,pmic-keys.yaml | 24 +------------------ 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml b/Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml index e34c9e78d38d8..70567d92c746e 100644 --- a/Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml +++ b/Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml @@ -90,26 +90,4 @@ required: unevaluatedProperties: false -examples: - - | - #include - #include - - pmic { - compatible = "mediatek,mt6397"; - - keys { - compatible = "mediatek,mt6397-keys"; - mediatek,long-press-mode = <1>; - power-off-time-sec = <0>; - - key-power { - linux,keycodes = ; - wakeup-source; - }; - - key-home { - linux,keycodes = ; - }; - }; - }; +... -- GitLab From d2ff98b7926e0956c0b92f7d5066cd8a65e7dfef Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 29 Nov 2023 13:06:14 +0200 Subject: [PATCH 0166/1290] dt-bindings: input: gpio-keys: Allow optional dedicated wakeirq Allow configuring an optional dedicated wakeirq for gpio-keys that some SoCs have. Let's use the common interrupt naming "irq" and "wakeup" that we already have in use for some drivers and subsystems like i2c framework. Note that the gpio-keys interrupt property is optional. If only a gpio property is specified, the driver tries to translate the gpio into an interrupt. Reviewed-by: Rob Herring Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20231129110618.27551-1-tony@atomide.com Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/gpio-keys.yaml | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/input/gpio-keys.yaml b/Documentation/devicetree/bindings/input/gpio-keys.yaml index 159cd9d9fe573..cc78c21529213 100644 --- a/Documentation/devicetree/bindings/input/gpio-keys.yaml +++ b/Documentation/devicetree/bindings/input/gpio-keys.yaml @@ -31,7 +31,23 @@ patternProperties: maxItems: 1 interrupts: - maxItems: 1 + oneOf: + - items: + - description: Optional key interrupt or wakeup interrupt + - items: + - description: Key interrupt + - description: Wakeup interrupt + + interrupt-names: + description: + Optional interrupt names, can be used to specify a separate dedicated + wake-up interrupt in addition to the gpio irq + oneOf: + - items: + - enum: [ irq, wakeup ] + - items: + - const: irq + - const: wakeup label: description: Descriptive name of the key. @@ -97,6 +113,20 @@ patternProperties: - required: - gpios + allOf: + - if: + properties: + interrupts: + minItems: 2 + required: + - interrupts + then: + properties: + interrupt-names: + minItems: 2 + required: + - interrupt-names + dependencies: wakeup-event-action: [ wakeup-source ] linux,input-value: [ gpios ] @@ -137,6 +167,15 @@ examples: linux,code = <108>; interrupts = <1 IRQ_TYPE_EDGE_FALLING>; }; + + key-wakeup { + label = "GPIO Key WAKEUP"; + linux,code = <143>; + interrupts-extended = <&intc 2 IRQ_TYPE_EDGE_FALLING>, + <&intc_wakeup 0 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "irq", "wakeup"; + wakeup-source; + }; }; ... -- GitLab From 3717194f249227a3dfd8433bd9374cc7e0cf823d Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 29 Nov 2023 13:06:15 +0200 Subject: [PATCH 0167/1290] Input: gpio-keys - add system suspend support for dedicated wakeirqs Some SoCs have a separate dedicated wake-up interrupt controller that can be used to wake up the system from deeper idle states. We already support configuring a separate interrupt for a gpio-keys button to be used with a gpio line. However, we are lacking support system suspend for cases where a separate interrupt needs to be used in deeper sleep modes. Because of it's nature, gpio-keys does not know about the runtime PM state of the button gpios, and may have several gpio buttons configured for each gpio-keys device instance. Implementing runtime PM support for gpio-keys does not help, and we cannot use drivers/base/power/wakeirq.c support. We need to implement custom wakeirq support for gpio-keys. For handling a dedicated wakeirq for system suspend, we enable and disable it with gpio_keys_enable_wakeup() and gpio_keys_disable_wakeup() that we already use based on device_may_wakeup(). Some systems may have a dedicated wakeirq that can also be used as the main interrupt, this is already working for gpio-keys. Let's add some wakeirq related comments while at it as the usage with a gpio line and separate interrupt line may not be obvious. Tested-by: Dhruva Gole Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20231129110618.27551-2-tony@atomide.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/gpio_keys.c | 69 ++++++++++++++++++++++++++++-- include/linux/gpio_keys.h | 2 + 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index 2e7c2c046e675..06c09b7addf8e 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -45,7 +45,9 @@ struct gpio_button_data { unsigned int software_debounce; /* in msecs, for GPIO-driven buttons */ unsigned int irq; + unsigned int wakeirq; unsigned int wakeup_trigger_type; + spinlock_t lock; bool disabled; bool key_pressed; @@ -511,6 +513,7 @@ static int gpio_keys_setup_key(struct platform_device *pdev, struct gpio_button_data *bdata = &ddata->data[idx]; irq_handler_t isr; unsigned long irqflags; + const char *wakedesc; int irq; int error; @@ -575,6 +578,14 @@ static int gpio_keys_setup_key(struct platform_device *pdev, !gpiod_cansleep(bdata->gpiod); } + /* + * If an interrupt was specified, use it instead of the gpio + * interrupt and use the gpio for reading the state. A separate + * interrupt may be used as the main button interrupt for + * runtime PM to detect events also in deeper idle states. If a + * dedicated wakeirq is used for system suspend only, see below + * for bdata->wakeirq setup. + */ if (button->irq) { bdata->irq = button->irq; } else { @@ -672,6 +683,36 @@ static int gpio_keys_setup_key(struct platform_device *pdev, return error; } + if (!button->wakeirq) + return 0; + + /* Use :wakeup suffix like drivers/base/power/wakeirq.c does */ + wakedesc = devm_kasprintf(dev, GFP_KERNEL, "%s:wakeup", desc); + if (!wakedesc) + return -ENOMEM; + + bdata->wakeirq = button->wakeirq; + irqflags |= IRQF_NO_SUSPEND; + + /* + * Wakeirq shares the handler with the main interrupt, it's only + * active during system suspend. See gpio_keys_button_enable_wakeup() + * and gpio_keys_button_disable_wakeup(). + */ + error = devm_request_any_context_irq(dev, bdata->wakeirq, isr, + irqflags, wakedesc, bdata); + if (error < 0) { + dev_err(dev, "Unable to claim wakeirq %d; error %d\n", + bdata->irq, error); + return error; + } + + /* + * Disable wakeirq until suspend. IRQF_NO_AUTOEN won't work if + * IRQF_SHARED was set based on !button->can_disable. + */ + disable_irq(bdata->wakeirq); + return 0; } @@ -728,7 +769,7 @@ gpio_keys_get_devtree_pdata(struct device *dev) struct gpio_keys_platform_data *pdata; struct gpio_keys_button *button; struct fwnode_handle *child; - int nbuttons; + int nbuttons, irq; nbuttons = device_get_child_node_count(dev); if (nbuttons == 0) @@ -750,9 +791,19 @@ gpio_keys_get_devtree_pdata(struct device *dev) device_property_read_string(dev, "label", &pdata->name); device_for_each_child_node(dev, child) { - if (is_of_node(child)) - button->irq = - irq_of_parse_and_map(to_of_node(child), 0); + if (is_of_node(child)) { + irq = of_irq_get_byname(to_of_node(child), "irq"); + if (irq > 0) + button->irq = irq; + + irq = of_irq_get_byname(to_of_node(child), "wakeup"); + if (irq > 0) + button->wakeirq = irq; + + if (!button->irq && !button->wakeirq) + button->irq = + irq_of_parse_and_map(to_of_node(child), 0); + } if (fwnode_property_read_u32(child, "linux,code", &button->code)) { @@ -921,6 +972,11 @@ gpio_keys_button_enable_wakeup(struct gpio_button_data *bdata) } } + if (bdata->wakeirq) { + enable_irq(bdata->wakeirq); + disable_irq(bdata->irq); + } + return 0; } @@ -929,6 +985,11 @@ gpio_keys_button_disable_wakeup(struct gpio_button_data *bdata) { int error; + if (bdata->wakeirq) { + enable_irq(bdata->irq); + disable_irq(bdata->wakeirq); + } + /* * The trigger type is always both edges for gpio-based keys and we do * not support changing wakeup trigger for interrupt-based keys. diff --git a/include/linux/gpio_keys.h b/include/linux/gpio_keys.h index 3f84aeb81e480..80fa930b04c67 100644 --- a/include/linux/gpio_keys.h +++ b/include/linux/gpio_keys.h @@ -21,6 +21,7 @@ struct device; * disable button via sysfs * @value: axis value for %EV_ABS * @irq: Irq number in case of interrupt keys + * @wakeirq: Optional dedicated wake-up interrupt */ struct gpio_keys_button { unsigned int code; @@ -34,6 +35,7 @@ struct gpio_keys_button { bool can_disable; int value; unsigned int irq; + unsigned int wakeirq; }; /** -- GitLab From af76b2dec0984a079d8497bfa37d29a9b55932e1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 30 Nov 2023 14:11:45 -0300 Subject: [PATCH 0168/1290] libapi: Add missing linux/types.h header to get the __u64 type on io.h There are functions using __u64, so we need to have the linux/types.h header otherwise we'll break when its not included before api/io.h. Fixes: e95770af4c4a280f ("tools api: Add a lightweight buffered reading api") Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZWjDPL+IzPPsuC3X@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/io.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/api/io.h b/tools/lib/api/io.h index a77b74c5fb655..2a7fe97588131 100644 --- a/tools/lib/api/io.h +++ b/tools/lib/api/io.h @@ -12,6 +12,7 @@ #include #include #include +#include struct io { /* File descriptor being read/ */ -- GitLab From 366efbff58092fac48421fa34018bb34c326088e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 27 Nov 2023 14:08:14 -0800 Subject: [PATCH 0169/1290] libperf: Lazily allocate/size mmap event copy The event copy in the mmap is used to have storage to read an event. Not all users of mmaps read the events, such as perf record. The amount of buffer was also statically set to PERF_SAMPLE_MAX_SIZE rather than the amount necessary from the header's event size. Switch to a model where the event_copy is reallocated if too small to the event's size. This adds the potential for the event to move, so if a copy of the event pointer were stored it could be broken. All the current users do: while(event = perf_mmap__read_event()) { ... } and so they would be broken due to the event being overwritten if they had stored the pointer. Manual inspection and address sanitizer testing also shows the event pointer not being stored. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Jajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Miguel Ojeda Cc: Ming Wang Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231127220902.1315692-3-irogers@google.com [ Replace two lines with equivalent zfree(&map->event_copy) ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/internal/mmap.h | 3 ++- tools/lib/perf/mmap.c | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/tools/lib/perf/include/internal/mmap.h b/tools/lib/perf/include/internal/mmap.h index 5a062af8e9d8e..5f08cab61ecec 100644 --- a/tools/lib/perf/include/internal/mmap.h +++ b/tools/lib/perf/include/internal/mmap.h @@ -33,7 +33,8 @@ struct perf_mmap { bool overwrite; u64 flush; libperf_unmap_cb_t unmap_cb; - char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); + void *event_copy; + size_t event_copy_sz; struct perf_mmap *next; }; diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c index 2184814b37dd3..0c903c2372c97 100644 --- a/tools/lib/perf/mmap.c +++ b/tools/lib/perf/mmap.c @@ -19,6 +19,7 @@ void perf_mmap__init(struct perf_mmap *map, struct perf_mmap *prev, bool overwrite, libperf_unmap_cb_t unmap_cb) { + /* Assume fields were zero initialized. */ map->fd = -1; map->overwrite = overwrite; map->unmap_cb = unmap_cb; @@ -51,13 +52,18 @@ int perf_mmap__mmap(struct perf_mmap *map, struct perf_mmap_param *mp, void perf_mmap__munmap(struct perf_mmap *map) { - if (map && map->base != NULL) { + if (!map) + return; + + zfree(&map->event_copy); + map->event_copy_sz = 0; + if (map->base) { munmap(map->base, perf_mmap__mmap_len(map)); map->base = NULL; map->fd = -1; refcount_set(&map->refcnt, 0); } - if (map && map->unmap_cb) + if (map->unmap_cb) map->unmap_cb(map); } @@ -223,9 +229,17 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map, */ if ((*startp & map->mask) + size != ((*startp + size) & map->mask)) { unsigned int offset = *startp; - unsigned int len = min(sizeof(*event), size), cpy; + unsigned int len = size, cpy; void *dst = map->event_copy; + if (size > map->event_copy_sz) { + dst = realloc(map->event_copy, size); + if (!dst) + return NULL; + map->event_copy = dst; + map->event_copy_sz = size; + } + do { cpy = min(map->mask + 1 - (offset & map->mask), len); memcpy(dst, &data[offset & map->mask], cpy); -- GitLab From b6a15269cee255dc5fe75c249c701e3956d892cb Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 27 Nov 2023 14:08:16 -0800 Subject: [PATCH 0170/1290] tools api fs: Switch filename__read_str to use io.h filename__read_str() has its own string reading code that allocates memory before reading into it. The memory allocated is sized at BUFSIZ that is 8kb. Most strings are short and so most of this 8kb is wasted. Refactor io__getline(), as io__getdelim(), so that the newline character can be configurable and ignored in the case of filename__read_str(). Code like build_caches_for_cpu() in perf's header.c will read many strings and hold them in a data structure, in this case multiple strings per cache level per CPU. Using io.h's io__getline() avoids the wasted memory as strings are temporarily read into a buffer on the stack before being copied to a buffer that grows 128 bytes at a time and is never sized larger than the string. For a 16 hyperthread system the memory consumption of "perf record true" is reduced by 180kb, primarily through saving memory when reading the cache information. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Jajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Miguel Ojeda Cc: Ming Wang Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231127220902.1315692-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/fs/fs.c | 56 +++++++++++-------------------------------- tools/lib/api/io.h | 11 ++++++--- 2 files changed, 22 insertions(+), 45 deletions(-) diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index 5cb0eeec2c8a6..004f2af5504b6 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -16,6 +16,7 @@ #include #include "fs.h" +#include "../io.h" #include "debug-internal.h" #define _STR(x) #x @@ -344,53 +345,24 @@ int filename__read_ull(const char *filename, unsigned long long *value) return filename__read_ull_base(filename, value, 0); } -#define STRERR_BUFSIZE 128 /* For the buffer size of strerror_r */ - int filename__read_str(const char *filename, char **buf, size_t *sizep) { - size_t size = 0, alloc_size = 0; - void *bf = NULL, *nbf; - int fd, n, err = 0; - char sbuf[STRERR_BUFSIZE]; + struct io io; + char bf[128]; + int err; - fd = open(filename, O_RDONLY); - if (fd < 0) + io.fd = open(filename, O_RDONLY); + if (io.fd < 0) return -errno; - - do { - if (size == alloc_size) { - alloc_size += BUFSIZ; - nbf = realloc(bf, alloc_size); - if (!nbf) { - err = -ENOMEM; - break; - } - - bf = nbf; - } - - n = read(fd, bf + size, alloc_size - size); - if (n < 0) { - if (size) { - pr_warn("read failed %d: %s\n", errno, - strerror_r(errno, sbuf, sizeof(sbuf))); - err = 0; - } else - err = -errno; - - break; - } - - size += n; - } while (n > 0); - - if (!err) { - *sizep = size; - *buf = bf; + io__init(&io, io.fd, bf, sizeof(bf)); + *buf = NULL; + err = io__getdelim(&io, buf, sizep, /*delim=*/-1); + if (err < 0) { + free(*buf); + *buf = NULL; } else - free(bf); - - close(fd); + err = 0; + close(io.fd); return err; } diff --git a/tools/lib/api/io.h b/tools/lib/api/io.h index 2a7fe97588131..84adf81020185 100644 --- a/tools/lib/api/io.h +++ b/tools/lib/api/io.h @@ -141,8 +141,8 @@ static inline int io__get_dec(struct io *io, __u64 *dec) } } -/* Read up to and including the first newline following the pattern of getline. */ -static inline ssize_t io__getline(struct io *io, char **line_out, size_t *line_len_out) +/* Read up to and including the first delim. */ +static inline ssize_t io__getdelim(struct io *io, char **line_out, size_t *line_len_out, int delim) { char buf[128]; int buf_pos = 0; @@ -152,7 +152,7 @@ static inline ssize_t io__getline(struct io *io, char **line_out, size_t *line_l /* TODO: reuse previously allocated memory. */ free(*line_out); - while (ch != '\n') { + while (ch != delim) { ch = io__get_char(io); if (ch < 0) @@ -185,4 +185,9 @@ err_out: return -ENOMEM; } +static inline ssize_t io__getline(struct io *io, char **line_out, size_t *line_len_out) +{ + return io__getdelim(io, line_out, line_len_out, /*delim=*/'\n'); +} + #endif /* __API_IO__ */ -- GitLab From f8846a1a3c54a53f1c30836a2b7143cfa5da223d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 27 Nov 2023 14:08:17 -0800 Subject: [PATCH 0171/1290] tools api fs: Avoid reading whole file for a 1 byte bool sysfs__read_bool() used the first byte from a fully read file into a string. It then looked at the first byte's value. Avoid doing this and just read the first byte. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Jajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Miguel Ojeda Cc: Ming Wang Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231127220902.1315692-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/fs/fs.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index 004f2af5504b6..337fde770e45f 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -447,15 +447,22 @@ int sysfs__read_str(const char *entry, char **buf, size_t *sizep) int sysfs__read_bool(const char *entry, bool *value) { - char *buf; - size_t size; - int ret; + struct io io; + char bf[16]; + int ret = 0; + char path[PATH_MAX]; + const char *sysfs = sysfs__mountpoint(); + + if (!sysfs) + return -1; - ret = sysfs__read_str(entry, &buf, &size); - if (ret < 0) - return ret; + snprintf(path, sizeof(path), "%s/%s", sysfs, entry); + io.fd = open(path, O_RDONLY); + if (io.fd < 0) + return -errno; - switch (buf[0]) { + io__init(&io, io.fd, bf, sizeof(bf)); + switch (io__get_char(&io)) { case '1': case 'y': case 'Y': @@ -469,8 +476,7 @@ int sysfs__read_bool(const char *entry, bool *value) default: ret = -1; } - - free(buf); + close(io.fd); return ret; } -- GitLab From ab47505ce45b869ab649024dc932e981fcdd6e5f Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Wed, 29 Nov 2023 17:45:14 +0100 Subject: [PATCH 0172/1290] backlight: mp3309c: Fix uninitialized local variable In the function "pm3309c_parse_dt_node", when the dimming analog control mode (by I2C messages) is enabled, the local variable "prop_levels" is tested without any initialization, as indicated by the following smatch warning: drivers/video/backlight/mp3309c.c:279 pm3309c_parse_dt_node() error: uninitialized symbol 'prop_levels'. To avoid any problem in case of undefined behavior, we need to initialize it to "NULL". Reported-by: Dan Carpenter Closes: https://lore.kernel.org/dri-devel/af0a1870-693b-442f-9b11-0503cfcd944a@moroto.mountain/ Fixes: 2e914516a58c ("backlight: mp3309c: Add support for MPS MP3309C") Signed-off-by: Flavio Suligoi Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20231129164514.2772719-1-f.suligoi@asem.it Signed-off-by: Lee Jones --- drivers/video/backlight/mp3309c.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/video/backlight/mp3309c.c b/drivers/video/backlight/mp3309c.c index 3fe4469ef43fd..34d71259fac1d 100644 --- a/drivers/video/backlight/mp3309c.c +++ b/drivers/video/backlight/mp3309c.c @@ -203,7 +203,8 @@ static int pm3309c_parse_dt_node(struct mp3309c_chip *chip, struct mp3309c_platform_data *pdata) { struct device_node *node = chip->dev->of_node; - struct property *prop_pwms, *prop_levels; + struct property *prop_pwms; + struct property *prop_levels = NULL; int length = 0; int ret, i; unsigned int num_levels, tmp_value; -- GitLab From 92ef432f027cffe0ff91ff2cbe9258d89ca53968 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 23 Nov 2023 18:05:40 +0900 Subject: [PATCH 0173/1290] kbuild: support W=c and W=e shorthands for Kconfig KCONFIG_WARN_UNKNOWN_SYMBOLS=1 and KCONFIG_WERROR=1 are descriptive and suitable in scripting, but typing them from the command line can be tedious. Associate them with KBUILD_EXTRA_WARN (and the W= shorthand). Support a new letter 'c' to enable extra checks in Kconfig. You can still manage compiler warnings (W=1) and Kconfig warnings (W=c) independently. Reuse the letter 'e' to turn Kconfig warnings into errors. As usual, you can combine multiple letters in KCONFIG_EXTRA_WARN. $ KCONFIG_WARN_UNKNOWN_SYMBOLS=1 KCONFIG_WERROR=1 make defconfig can be shortened to: $ KBUILD_EXTRA_WARN=ce make defconfig or, even shorter: $ make W=ce defconfig Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor --- Makefile | 10 ++++++++++ scripts/Makefile.extrawarn | 9 --------- scripts/kconfig/Makefile | 8 ++++++++ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 0df7372175294..5a11804af640a 100644 --- a/Makefile +++ b/Makefile @@ -155,6 +155,15 @@ endif export KBUILD_EXTMOD +# backward compatibility +KBUILD_EXTRA_WARN ?= $(KBUILD_ENABLE_EXTRA_GCC_CHECKS) + +ifeq ("$(origin W)", "command line") + KBUILD_EXTRA_WARN := $(W) +endif + +export KBUILD_EXTRA_WARN + # Kbuild will save output files in the current working directory. # This does not need to match to the root of the kernel source tree. # @@ -1659,6 +1668,7 @@ help: @echo ' 1: warnings which may be relevant and do not occur too often' @echo ' 2: warnings which occur quite often but may still be relevant' @echo ' 3: more obscure warnings, can most likely be ignored' + @echo ' c: extra checks in the configuration stage (Kconfig)' @echo ' e: warnings are being treated as errors' @echo ' Multiple levels can be combined with W=12 or W=123' @$(if $(dtstree), \ diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index 2fe6f2828d376..3f94915fab372 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -80,15 +80,6 @@ KBUILD_CFLAGS += $(call cc-option,-Werror=designated-init) # Warn if there is an enum types mismatch KBUILD_CFLAGS += $(call cc-option,-Wenum-conversion) -# backward compatibility -KBUILD_EXTRA_WARN ?= $(KBUILD_ENABLE_EXTRA_GCC_CHECKS) - -ifeq ("$(origin W)", "command line") - KBUILD_EXTRA_WARN := $(W) -endif - -export KBUILD_EXTRA_WARN - # # W=1 - warnings which may be relevant and do not occur too often # diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index 4eee155121a8b..322c061b464d2 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -27,6 +27,14 @@ KCONFIG_DEFCONFIG_LIST += \ endif KCONFIG_DEFCONFIG_LIST += arch/$(SRCARCH)/configs/$(KBUILD_DEFCONFIG) +ifneq ($(findstring c, $(KBUILD_EXTRA_WARN)),) +export KCONFIG_WARN_UNKNOWN_SYMBOLS=1 +endif + +ifneq ($(findstring e, $(KBUILD_EXTRA_WARN)),) +export KCONFIG_WERROR=1 +endif + # We need this, in case the user has it in its environment unexport CONFIG_ -- GitLab From 0df8e97085946dd79c06720678a845778b6d6bf8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 24 Nov 2023 23:09:08 +0900 Subject: [PATCH 0174/1290] scripts: clean up IA-64 code A little more janitorial work after commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"). Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/checkstack.pl | 3 --- scripts/gdb/linux/tasks.py | 15 +++------------ scripts/head-object-list.txt | 1 - scripts/kconfig/mconf.c | 2 +- scripts/kconfig/nconf.c | 2 +- scripts/package/kernel.spec | 6 ------ scripts/package/mkdebian | 2 +- scripts/recordmcount.c | 1 - scripts/recordmcount.pl | 7 ------- scripts/xz_wrap.sh | 1 - 10 files changed, 6 insertions(+), 34 deletions(-) diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index d83ba5d8f3f49..5995dd11a5a6e 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -68,9 +68,6 @@ my (@stack, $re, $dre, $sub, $x, $xs, $funcre, $min_stack); # 2f60: 48 81 ec e8 05 00 00 sub $0x5e8,%rsp $re = qr/^.*[as][du][db] \$(0x$x{1,8}),\%(e|r)sp$/o; $dre = qr/^.*[as][du][db] (%.*),\%(e|r)sp$/o; - } elsif ($arch eq 'ia64') { - #e0000000044011fc: 01 0f fc 8c adds r12=-384,r12 - $re = qr/.*adds.*r12=-(([0-9]{2}|[3-9])[0-9]{2}),r12/o; } elsif ($arch eq 'm68k') { # 2b6c: 4e56 fb70 linkw %fp,#-1168 # 1df770: defc ffe4 addaw #-28,%sp diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py index 17ec19e9b5bf6..5be53b372a693 100644 --- a/scripts/gdb/linux/tasks.py +++ b/scripts/gdb/linux/tasks.py @@ -86,21 +86,12 @@ LxPs() thread_info_type = utils.CachedType("struct thread_info") -ia64_task_size = None - def get_thread_info(task): thread_info_ptr_type = thread_info_type.get_type().pointer() - if utils.is_target_arch("ia64"): - global ia64_task_size - if ia64_task_size is None: - ia64_task_size = gdb.parse_and_eval("sizeof(struct task_struct)") - thread_info_addr = task.address + ia64_task_size - thread_info = thread_info_addr.cast(thread_info_ptr_type) - else: - if task.type.fields()[0].type == thread_info_type.get_type(): - return task['thread_info'] - thread_info = task['stack'].cast(thread_info_ptr_type) + if task.type.fields()[0].type == thread_info_type.get_type(): + return task['thread_info'] + thread_info = task['stack'].cast(thread_info_ptr_type) return thread_info.dereference() diff --git a/scripts/head-object-list.txt b/scripts/head-object-list.txt index 26359968744ef..890f69005bab4 100644 --- a/scripts/head-object-list.txt +++ b/scripts/head-object-list.txt @@ -17,7 +17,6 @@ arch/arm/kernel/head-nommu.o arch/arm/kernel/head.o arch/csky/kernel/head.o arch/hexagon/kernel/head.o -arch/ia64/kernel/head.o arch/loongarch/kernel/head.o arch/m68k/68000/head.o arch/m68k/coldfire/head.o diff --git a/scripts/kconfig/mconf.c b/scripts/kconfig/mconf.c index eccc87a441e71..3795c36a9181a 100644 --- a/scripts/kconfig/mconf.c +++ b/scripts/kconfig/mconf.c @@ -247,7 +247,7 @@ search_help[] = " -> PCI support (PCI [=y])\n" "(1) -> PCI access mode ( [=y])\n" " Defined at drivers/pci/Kconfig:47\n" - " Depends on: X86_LOCAL_APIC && X86_IO_APIC || IA64\n" + " Depends on: X86_LOCAL_APIC && X86_IO_APIC\n" " Selects: LIBCRC32\n" " Selected by: BAR [=n]\n" "-----------------------------------------------------------------\n" diff --git a/scripts/kconfig/nconf.c b/scripts/kconfig/nconf.c index 143a2c351d576..8cd72fe259740 100644 --- a/scripts/kconfig/nconf.c +++ b/scripts/kconfig/nconf.c @@ -216,7 +216,7 @@ search_help[] = "Symbol: FOO [ = m]\n" "Prompt: Foo bus is used to drive the bar HW\n" "Defined at drivers/pci/Kconfig:47\n" -"Depends on: X86_LOCAL_APIC && X86_IO_APIC || IA64\n" +"Depends on: X86_LOCAL_APIC && X86_IO_APIC\n" "Location:\n" " -> Bus options (PCI, PCMCIA, EISA, ISA)\n" " -> PCI support (PCI [ = y])\n" diff --git a/scripts/package/kernel.spec b/scripts/package/kernel.spec index 3eee0143e0c5c..89298983a1694 100644 --- a/scripts/package/kernel.spec +++ b/scripts/package/kernel.spec @@ -56,13 +56,7 @@ patch -p1 < %{SOURCE2} %install mkdir -p %{buildroot}/boot -%ifarch ia64 -mkdir -p %{buildroot}/boot/efi -cp $(%{make} %{makeflags} -s image_name) %{buildroot}/boot/efi/vmlinuz-%{KERNELRELEASE} -ln -s efi/vmlinuz-%{KERNELRELEASE} %{buildroot}/boot/ -%else cp $(%{make} %{makeflags} -s image_name) %{buildroot}/boot/vmlinuz-%{KERNELRELEASE} -%endif %{make} %{makeflags} INSTALL_MOD_PATH=%{buildroot} modules_install %{make} %{makeflags} INSTALL_HDR_PATH=%{buildroot}/usr headers_install cp System.map %{buildroot}/boot/System.map-%{KERNELRELEASE} diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index 5044224cf6714..c1a36da85e84f 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -26,7 +26,7 @@ set_debarch() { # Attempt to find the correct Debian architecture case "$UTS_MACHINE" in - i386|ia64|alpha|m68k|riscv*) + i386|alpha|m68k|riscv*) debarch="$UTS_MACHINE" ;; x86_64) debarch=amd64 ;; diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 40ae6b2c7a6da..3e4f54799cc0a 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -590,7 +590,6 @@ static int do_file(char const *const fname) ideal_nop = ideal_nop4_arm64; is_fake_mcount64 = arm64_is_fake_mcount; break; - case EM_IA_64: reltype = R_IA64_IMM64; break; case EM_MIPS: /* reltype: e_class */ break; case EM_LOONGARCH: /* reltype: e_class */ break; case EM_PPC: reltype = R_PPC_ADDR32; break; diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 6a4645a579760..f84df9e383fd0 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -275,13 +275,6 @@ if ($arch eq "x86_64") { $section_type = '%progbits'; $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_AARCH64_CALL26\\s+_mcount\$"; $type = ".quad"; -} elsif ($arch eq "ia64") { - $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; - $type = "data8"; - - if ($is_module eq "0") { - $cc .= " -mconstant-gp"; - } } elsif ($arch eq "sparc64") { # In the objdump output there are giblets like: # 0000000000000000 : diff --git a/scripts/xz_wrap.sh b/scripts/xz_wrap.sh index 76e9cbcfbeab4..d06baf626abe7 100755 --- a/scripts/xz_wrap.sh +++ b/scripts/xz_wrap.sh @@ -15,7 +15,6 @@ LZMA2OPTS= case $SRCARCH in x86) BCJ=--x86 ;; powerpc) BCJ=--powerpc ;; - ia64) BCJ=--ia64; LZMA2OPTS=pb=4 ;; arm) BCJ=--arm ;; sparc) BCJ=--sparc ;; esac -- GitLab From 4e244c10eab345a735c5052688e4a55bddce5bf7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 26 Nov 2023 01:35:58 +0900 Subject: [PATCH 0175/1290] kconfig: remove unneeded symbol_empty variable This is used only for initializing other variables. Use the empty string "" directly. Please note newval.tri is unused for S_INT/HEX/STRING. Signed-off-by: Masahiro Yamada --- scripts/kconfig/symbol.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c index a76925b46ce63..f7075d148ac79 100644 --- a/scripts/kconfig/symbol.c +++ b/scripts/kconfig/symbol.c @@ -29,12 +29,6 @@ struct symbol symbol_no = { .flags = SYMBOL_CONST|SYMBOL_VALID, }; -static struct symbol symbol_empty = { - .name = "", - .curr = { "", no }, - .flags = SYMBOL_VALID, -}; - struct symbol *modules_sym; static tristate modules_val; @@ -346,7 +340,7 @@ void sym_calc_value(struct symbol *sym) case S_INT: case S_HEX: case S_STRING: - newval = symbol_empty.curr; + newval.val = ""; break; case S_BOOLEAN: case S_TRISTATE: @@ -697,13 +691,12 @@ const char *sym_get_string_default(struct symbol *sym) { struct property *prop; struct symbol *ds; - const char *str; + const char *str = ""; tristate val; sym_calc_visibility(sym); sym_calc_value(modules_sym); val = symbol_no.curr.tri; - str = symbol_empty.curr.val; /* If symbol has a default value look it up */ prop = sym_get_default_prop(sym); -- GitLab From 6262afa10ef7cc8fdf39b81a36f9546b68810431 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 26 Nov 2023 01:35:59 +0900 Subject: [PATCH 0176/1290] kconfig: default to zero if int/hex symbol lacks default property When a default property is missing in an int or hex symbol, it defaults to an empty string, which is not a valid symbol value. It results in an incorrect .config, and can also lead to an infinite loop in scripting. Use "0" for int and "0x0" for hex as a default value. Signed-off-by: Masahiro Yamada Reviewed-by: Yoann Congal --- scripts/kconfig/symbol.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c index f7075d148ac79..a5a4f9153eb73 100644 --- a/scripts/kconfig/symbol.c +++ b/scripts/kconfig/symbol.c @@ -338,7 +338,11 @@ void sym_calc_value(struct symbol *sym) switch (sym->type) { case S_INT: + newval.val = "0"; + break; case S_HEX: + newval.val = "0x0"; + break; case S_STRING: newval.val = ""; break; @@ -746,14 +750,17 @@ const char *sym_get_string_default(struct symbol *sym) case yes: return "y"; } case S_INT: + if (!str[0]) + str = "0"; + break; case S_HEX: - return str; - case S_STRING: - return str; - case S_UNKNOWN: + if (!str[0]) + str = "0x0"; + break; + default: break; } - return ""; + return str; } const char *sym_get_string_value(struct symbol *sym) -- GitLab From 072b6ad7cac6a868e56dec5f48a2e67d9ab8cb6e Mon Sep 17 00:00:00 2001 From: Nick Forrington Date: Thu, 2 Nov 2023 16:11:16 +0000 Subject: [PATCH 0177/1290] perf docs: Fix man page formatting for 'perf lock' This makes "CONTENTION" a top level section (rather than a subsection of "INFO"). Fixes: 79079f21f50a501f ("perf lock: Add -k and -F options to 'contention' subcommand") Reviewed-by: James Clark Signed-off-by: Nick Forrington Tested-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/r/20231102161117.49533-1-nick.forrington@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-lock.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt index 503abcba14380..f5938d616d751 100644 --- a/tools/perf/Documentation/perf-lock.txt +++ b/tools/perf/Documentation/perf-lock.txt @@ -119,7 +119,7 @@ INFO OPTIONS CONTENTION OPTIONS --------------- +------------------ -k:: --key=:: -- GitLab From 556bed5c6d4167f9ffb5c8648cdd3c8e39aefec7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 30 Nov 2023 18:46:36 -0300 Subject: [PATCH 0178/1290] perf beauty: Don't use 'find ... -printf' as it isn't available in busybox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Namhyung reported: I'm seeing a build error on my Alpine linux image which uses busybox + musl libc: In file included from trace/beauty/arch_errno_names.c:1, from builtin-trace.c:899: /build/trace/beauty/generated/arch_errno_name_array.c: In function 'arch_syscalls__strerrno': /build/trace/beauty/generated/arch_errno_name_array.c:142:49: error: unused parameter 'arch' [-Werror=unused-parameter] 142 | const char *arch_syscalls__strerrno(const char *arch, int err) It looks like busybox find command doesn't have -printf option find: unrecognized: -printf , Yesterday 9:16 PM , BusyBox v1.36.1 (2023-07-27 17:12:24 UTC) multi-call binary. Usage: find [-HL] [PATH]... [OPTIONS] [ACTIONS] Search for files and perform actions on them. First failed action stops processing of current file. Defaults: PATH is current directory, action is '-print' So just remove it and pipe find's entry to a basename loop to produce the same result. Then use an alternative loop that relies on the shell to avoid needless forks and execs. The discussion about it generated the impetus to stop doing strcmps to find the right table at each errno to string translation but instead do this just once and then use a function pointer to the right arch specific table. Suggested-by: David Laight Reported-by: Namhyung Kim Cc: Adrian Hunter Cc: Hendrik Brueckner Cc: Ian Rogers Cc: Jiri Olsa Cc: Michael Petlan Cc: Thomas Richter Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/arch_errno_names.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/trace/beauty/arch_errno_names.sh b/tools/perf/trace/beauty/arch_errno_names.sh index cc09dcaa891e0..b6e0767b4b34e 100755 --- a/tools/perf/trace/beauty/arch_errno_names.sh +++ b/tools/perf/trace/beauty/arch_errno_names.sh @@ -76,7 +76,9 @@ EoHEADER # Create list of architectures that have a specific errno.h. archlist="" -for arch in $(find $toolsdir/arch -maxdepth 1 -mindepth 1 -type d -printf "%f\n" | sort -r); do +for f in $toolsdir/arch/*/include/uapi/asm/errno.h; do + d=${f%/include/uapi/asm/errno.h} + arch="${d##*/}" test -f $toolsdir/arch/$arch/include/uapi/asm/errno.h && archlist="$archlist $arch" done -- GitLab From 54373b5d53c1f6aa6164ee5bea4761abb16b351c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 1 Dec 2023 14:52:00 -0300 Subject: [PATCH 0179/1290] perf env: Introduce perf_env__arch_strerrno() That will cache the arch specific function translating error numbers to strings. Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: David Laight Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20231201203046.486596-2-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 6 ++---- tools/perf/util/env.c | 12 ++++++++++++ tools/perf/util/env.h | 1 + 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index e541d0e2777ab..109b8e64fe69a 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2470,9 +2470,8 @@ static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sam static const char *errno_to_name(struct evsel *evsel, int err) { struct perf_env *env = evsel__env(evsel); - const char *arch_name = perf_env__arch(env); - return arch_syscalls__strerrno(arch_name, err); + return perf_env__arch_strerrno(env, err); } static int trace__sys_exit(struct trace *trace, struct evsel *evsel, @@ -4264,12 +4263,11 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct); if (trace->errno_summary && stats->nr_failures) { - const char *arch_name = perf_env__arch(trace->host->env); int e; for (e = 0; e < stats->max_errno; ++e) { if (stats->errnos[e] != 0) - fprintf(fp, "\t\t\t\t%s: %d\n", arch_syscalls__strerrno(arch_name, e + 1), stats->errnos[e]); + fprintf(fp, "\t\t\t\t%s: %d\n", perf_env__arch_strerrno(trace->host->env, e + 1), stats->errnos[e]); } } } diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index cbc18b22ace52..a632f33646bb3 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -3,6 +3,7 @@ #include "debug.h" #include "env.h" #include "util/header.h" +#include "linux/compiler.h" #include #include #include "cgroup.h" @@ -12,6 +13,7 @@ #include #include "pmus.h" #include "strbuf.h" +#include "trace/beauty/beauty.h" struct perf_env perf_env; @@ -453,6 +455,16 @@ const char *perf_env__arch(struct perf_env *env) return normalize_arch(arch_name); } +const char *perf_env__arch_strerrno(struct perf_env *env __maybe_unused, int err __maybe_unused) +{ +#if defined(HAVE_SYSCALL_TABLE_SUPPORT) && defined(HAVE_LIBTRACEEVENT) + const char *arch_name = perf_env__arch(env); + return arch_syscalls__strerrno(arch_name, err); +#else + return "!(HAVE_SYSCALL_TABLE_SUPPORT && HAVE_LIBTRACEEVENT)"; +#endif +} + const char *perf_env__cpuid(struct perf_env *env) { int status; diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 94596ff124d54..79f371879f45b 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -164,6 +164,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env); void cpu_cache_level__free(struct cpu_cache_level *cache); const char *perf_env__arch(struct perf_env *env); +const char *perf_env__arch_strerrno(struct perf_env *env, int err); const char *perf_env__cpuid(struct perf_env *env); const char *perf_env__raw_arch(struct perf_env *env); int perf_env__nr_cpus_avail(struct perf_env *env); -- GitLab From 4acef67646f35e14d202d5f534c0fd68d7691b22 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 1 Dec 2023 17:07:34 -0300 Subject: [PATCH 0180/1290] perf env: Cache the arch specific strerrno function in perf_env__arch_strerrno() So that we don't have to go thru the series of strcmp(arch) calls for each id -> string translation. Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: David Laight Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/20231201203046.486596-3-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/arch_errno_names.sh | 6 +++--- tools/perf/trace/beauty/beauty.h | 2 -- tools/perf/util/env.c | 6 ++++-- tools/perf/util/env.h | 5 +++++ 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/perf/trace/beauty/arch_errno_names.sh b/tools/perf/trace/beauty/arch_errno_names.sh index b6e0767b4b34e..7df4bf5b55a3c 100755 --- a/tools/perf/trace/beauty/arch_errno_names.sh +++ b/tools/perf/trace/beauty/arch_errno_names.sh @@ -57,13 +57,13 @@ create_arch_errno_table_func() archlist="$1" default="$2" - printf 'const char *arch_syscalls__strerrno(const char *arch, int err)\n' + printf 'arch_syscalls__strerrno_t *arch_syscalls__strerrno_function(const char *arch)\n' printf '{\n' for arch in $archlist; do printf '\tif (!strcmp(arch, "%s"))\n' $(arch_string "$arch") - printf '\t\treturn errno_to_name__%s(err);\n' $(arch_string "$arch") + printf '\t\treturn errno_to_name__%s;\n' $(arch_string "$arch") done - printf '\treturn errno_to_name__%s(err);\n' $(arch_string "$default") + printf '\treturn errno_to_name__%s;\n' $(arch_string "$default") printf '}\n' } diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 788e8f6bd90eb..9feb794f5c6e1 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -251,6 +251,4 @@ size_t open__scnprintf_flags(unsigned long flags, char *bf, size_t size, bool sh void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg, size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg)); -const char *arch_syscalls__strerrno(const char *arch, int err); - #endif /* _PERF_TRACE_BEAUTY_H */ diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index a632f33646bb3..c68b7a004f294 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -458,8 +458,10 @@ const char *perf_env__arch(struct perf_env *env) const char *perf_env__arch_strerrno(struct perf_env *env __maybe_unused, int err __maybe_unused) { #if defined(HAVE_SYSCALL_TABLE_SUPPORT) && defined(HAVE_LIBTRACEEVENT) - const char *arch_name = perf_env__arch(env); - return arch_syscalls__strerrno(arch_name, err); + if (env->arch_strerrno == NULL) + env->arch_strerrno = arch_syscalls__strerrno_function(perf_env__arch(env)); + + return env->arch_strerrno ? env->arch_strerrno(err) : "no arch specific strerrno function"; #else return "!(HAVE_SYSCALL_TABLE_SUPPORT && HAVE_LIBTRACEEVENT)"; #endif diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 79f371879f45b..bf7e3c4c211f9 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -53,6 +53,10 @@ struct pmu_caps { char *pmu_name; }; +typedef const char *(arch_syscalls__strerrno_t)(int err); + +arch_syscalls__strerrno_t *arch_syscalls__strerrno_function(const char *arch); + struct perf_env { char *hostname; char *os_release; @@ -135,6 +139,7 @@ struct perf_env { */ bool enabled; } clock; + arch_syscalls__strerrno_t *arch_strerrno; }; enum perf_compress_type { -- GitLab From 28b01743ca752cea5ab182297d8b912b22f2a2d1 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Fri, 1 Dec 2023 20:46:17 +0100 Subject: [PATCH 0181/1290] perf test record user-regs: Fix mask for vg register The 'vg' register for arm64 shows up in --user_regs as available when masking the variable AT_HWCAP with 1 << 22 returns '1' as done in perf_regs.c. However, in subtests for support of SVE, the check for the 'vg' register is done by masking the variable AT_HWCAP with the value 0x200000 which is equals to 1 << 21 instead of 1 << 22. This results in inconsistencies on certain systems where the test expects that the 'vg' register is not operational when it is, and vice-versa. During the testing on a machine that the test expected not to have the 'vg' register available, 'perf record' with the option --user-regs showed records for the 'vg' register together with all of the others, which means that the mask for the subtest of perf_event_attr is off by one. Change the value of the mask from 0x200000 to 0x400000 to correct it. Fixes: 9440ebdc333dd12e ("perf test arm64: Add attr tests for new VG register") Reviewed-by: Leo Yan Signed-off-by: Veronika Molnarova Cc: James Clark Cc: Michael Petlan Link: https://lore.kernel.org/r/20231201194617.13012-1-vmolnaro@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/attr/test-record-user-regs-no-sve-aarch64 | 2 +- tools/perf/tests/attr/test-record-user-regs-sve-aarch64 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/attr/test-record-user-regs-no-sve-aarch64 b/tools/perf/tests/attr/test-record-user-regs-no-sve-aarch64 index fbb065842880f..bed765450ca97 100644 --- a/tools/perf/tests/attr/test-record-user-regs-no-sve-aarch64 +++ b/tools/perf/tests/attr/test-record-user-regs-no-sve-aarch64 @@ -6,4 +6,4 @@ args = --no-bpf-event --user-regs=vg kill >/dev/null 2>&1 ret = 129 test_ret = true arch = aarch64 -auxv = auxv["AT_HWCAP"] & 0x200000 == 0 +auxv = auxv["AT_HWCAP"] & 0x400000 == 0 diff --git a/tools/perf/tests/attr/test-record-user-regs-sve-aarch64 b/tools/perf/tests/attr/test-record-user-regs-sve-aarch64 index c598c803221da..a65113cd7311b 100644 --- a/tools/perf/tests/attr/test-record-user-regs-sve-aarch64 +++ b/tools/perf/tests/attr/test-record-user-regs-sve-aarch64 @@ -6,7 +6,7 @@ args = --no-bpf-event --user-regs=vg kill >/dev/null 2>&1 ret = 1 test_ret = true arch = aarch64 -auxv = auxv["AT_HWCAP"] & 0x200000 == 0x200000 +auxv = auxv["AT_HWCAP"] & 0x400000 == 0x400000 kernel_since = 6.1 [event:base-record] -- GitLab From 2202844e4468c7539dba0c0b06577c93735af952 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 6 Nov 2023 15:22:23 +0800 Subject: [PATCH 0182/1290] vfio/migration: Add debugfs to live migration driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are multiple devices, software and operational steps involved in the process of live migration. An error occurred on any node may cause the live migration operation to fail. This complex process makes it very difficult to locate and analyze the cause when the function fails. In order to quickly locate the cause of the problem when the live migration fails, I added a set of debugfs to the vfio live migration driver. +-------------------------------------------+ | | | | | QEMU | | | | | +---+----------------------------+----------+ | ^ | ^ | | | | | | | | v | v | +---------+--+ +---------+--+ |src vfio_dev| |dst vfio_dev| +--+---------+ +--+---------+ | ^ | ^ | | | | v | | | +-----------+----+ +-----------+----+ |src dev debugfs | |dst dev debugfs | +----------------+ +----------------+ The entire debugfs directory will be based on the definition of the CONFIG_DEBUG_FS macro. If this macro is not enabled, the interfaces in vfio.h will be empty definitions, and the creation and initialization of the debugfs directory will not be executed. vfio | +--- | +---migration | +--state | +--- +---migration +--state debugfs will create a public root directory "vfio" file. then create a dev_name() file for each live migration device. First, create a unified state acquisition file of "migration" in this device directory. Then, create a public live migration state lookup file "state". Signed-off-by: Longfang Liu Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20231106072225.28577-2-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/Kconfig | 10 +++++ drivers/vfio/Makefile | 1 + drivers/vfio/debugfs.c | 92 +++++++++++++++++++++++++++++++++++++++ drivers/vfio/vfio.h | 14 ++++++ drivers/vfio/vfio_main.c | 4 ++ include/linux/vfio.h | 7 +++ include/uapi/linux/vfio.h | 1 + 7 files changed, 129 insertions(+) create mode 100644 drivers/vfio/debugfs.c diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 6bda6dbb48784..ceae52fd7586d 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -80,6 +80,16 @@ config VFIO_VIRQFD select EVENTFD default n +config VFIO_DEBUGFS + bool "Export VFIO internals in DebugFS" + depends on DEBUG_FS + help + Allows exposure of VFIO device internals. This option enables + the use of debugfs by VFIO drivers as required. The device can + cause the VFIO code create a top-level debug/vfio directory + during initialization, and then populate a subdirectory with + entries as required. + source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" source "drivers/vfio/mdev/Kconfig" diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 68c05705200fc..b2fc9fb499d86 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -7,6 +7,7 @@ vfio-$(CONFIG_VFIO_GROUP) += group.o vfio-$(CONFIG_IOMMUFD) += iommufd.o vfio-$(CONFIG_VFIO_CONTAINER) += container.o vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o +vfio-$(CONFIG_VFIO_DEBUGFS) += debugfs.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o diff --git a/drivers/vfio/debugfs.c b/drivers/vfio/debugfs.c new file mode 100644 index 0000000000000..298bd866f1576 --- /dev/null +++ b/drivers/vfio/debugfs.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, HiSilicon Ltd. + */ + +#include +#include +#include +#include +#include "vfio.h" + +static struct dentry *vfio_debugfs_root; + +static int vfio_device_state_read(struct seq_file *seq, void *data) +{ + struct device *vf_dev = seq->private; + struct vfio_device *vdev = container_of(vf_dev, + struct vfio_device, device); + enum vfio_device_mig_state state; + int ret; + + BUILD_BUG_ON(VFIO_DEVICE_STATE_NR != + VFIO_DEVICE_STATE_PRE_COPY_P2P + 1); + + ret = vdev->mig_ops->migration_get_state(vdev, &state); + if (ret) + return -EINVAL; + + switch (state) { + case VFIO_DEVICE_STATE_ERROR: + seq_puts(seq, "ERROR\n"); + break; + case VFIO_DEVICE_STATE_STOP: + seq_puts(seq, "STOP\n"); + break; + case VFIO_DEVICE_STATE_RUNNING: + seq_puts(seq, "RUNNING\n"); + break; + case VFIO_DEVICE_STATE_STOP_COPY: + seq_puts(seq, "STOP_COPY\n"); + break; + case VFIO_DEVICE_STATE_RESUMING: + seq_puts(seq, "RESUMING\n"); + break; + case VFIO_DEVICE_STATE_RUNNING_P2P: + seq_puts(seq, "RUNNING_P2P\n"); + break; + case VFIO_DEVICE_STATE_PRE_COPY: + seq_puts(seq, "PRE_COPY\n"); + break; + case VFIO_DEVICE_STATE_PRE_COPY_P2P: + seq_puts(seq, "PRE_COPY_P2P\n"); + break; + default: + seq_puts(seq, "Invalid\n"); + } + + return 0; +} + +void vfio_device_debugfs_init(struct vfio_device *vdev) +{ + struct device *dev = &vdev->device; + + vdev->debug_root = debugfs_create_dir(dev_name(vdev->dev), + vfio_debugfs_root); + + if (vdev->mig_ops) { + struct dentry *vfio_dev_migration = NULL; + + vfio_dev_migration = debugfs_create_dir("migration", + vdev->debug_root); + debugfs_create_devm_seqfile(dev, "state", vfio_dev_migration, + vfio_device_state_read); + } +} + +void vfio_device_debugfs_exit(struct vfio_device *vdev) +{ + debugfs_remove_recursive(vdev->debug_root); +} + +void vfio_debugfs_create_root(void) +{ + vfio_debugfs_root = debugfs_create_dir("vfio", NULL); +} + +void vfio_debugfs_remove_root(void) +{ + debugfs_remove_recursive(vfio_debugfs_root); + vfio_debugfs_root = NULL; +} diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 307e3f29b527f..bde84ad344e50 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -448,4 +448,18 @@ static inline void vfio_device_put_kvm(struct vfio_device *device) } #endif +#ifdef CONFIG_VFIO_DEBUGFS +void vfio_debugfs_create_root(void); +void vfio_debugfs_remove_root(void); + +void vfio_device_debugfs_init(struct vfio_device *vdev); +void vfio_device_debugfs_exit(struct vfio_device *vdev); +#else +static inline void vfio_debugfs_create_root(void) { } +static inline void vfio_debugfs_remove_root(void) { } + +static inline void vfio_device_debugfs_init(struct vfio_device *vdev) { } +static inline void vfio_device_debugfs_exit(struct vfio_device *vdev) { } +#endif /* CONFIG_VFIO_DEBUGFS */ + #endif diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 8d4995ada74a0..1cc93aac99a29 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -311,6 +311,7 @@ static int __vfio_register_dev(struct vfio_device *device, refcount_set(&device->refcount, 1); vfio_device_group_register(device); + vfio_device_debugfs_init(device); return 0; err_out: @@ -378,6 +379,7 @@ void vfio_unregister_group_dev(struct vfio_device *device) } } + vfio_device_debugfs_exit(device); /* Balances vfio_device_set_group in register path */ vfio_device_remove_group(device); } @@ -1676,6 +1678,7 @@ static int __init vfio_init(void) if (ret) goto err_alloc_dev_chrdev; + vfio_debugfs_create_root(); pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); return 0; @@ -1691,6 +1694,7 @@ err_virqfd: static void __exit vfio_cleanup(void) { + vfio_debugfs_remove_root(); ida_destroy(&vfio.device_ida); vfio_cdev_cleanup(); class_destroy(vfio.device_class); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index a65b2513f8cdc..89b265bc6ec31 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -69,6 +69,13 @@ struct vfio_device { u8 iommufd_attached:1; #endif u8 cdev_opened:1; +#ifdef CONFIG_DEBUG_FS + /* + * debug_root is a static property of the vfio_device + * which must be set prior to registering the vfio_device. + */ + struct dentry *debug_root; +#endif }; /** diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 7f5fb010226d8..2b68e6cdf1902 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1219,6 +1219,7 @@ enum vfio_device_mig_state { VFIO_DEVICE_STATE_RUNNING_P2P = 5, VFIO_DEVICE_STATE_PRE_COPY = 6, VFIO_DEVICE_STATE_PRE_COPY_P2P = 7, + VFIO_DEVICE_STATE_NR, }; /** -- GitLab From 7b994177805f4ef05913c3f242840b262ecf2fa1 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 6 Nov 2023 15:22:24 +0800 Subject: [PATCH 0183/1290] Documentation: add debugfs description for vfio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an debugfs document description file to help users understand how to use the accelerator live migration driver's debugfs. Signed-off-by: Longfang Liu Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20231106072225.28577-3-liulongfang@huawei.com Signed-off-by: Alex Williamson --- Documentation/ABI/testing/debugfs-vfio | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 Documentation/ABI/testing/debugfs-vfio diff --git a/Documentation/ABI/testing/debugfs-vfio b/Documentation/ABI/testing/debugfs-vfio new file mode 100644 index 0000000000000..90f7c262f5913 --- /dev/null +++ b/Documentation/ABI/testing/debugfs-vfio @@ -0,0 +1,25 @@ +What: /sys/kernel/debug/vfio +Date: December 2023 +KernelVersion: 6.8 +Contact: Longfang Liu +Description: This debugfs file directory is used for debugging + of vfio devices, it's a common directory for all vfio devices. + Vfio core will create a device subdirectory under this + directory. + +What: /sys/kernel/debug/vfio//migration +Date: December 2023 +KernelVersion: 6.8 +Contact: Longfang Liu +Description: This debugfs file directory is used for debugging + of vfio devices that support live migration. + The debugfs of each vfio device that supports live migration + could be created under this directory. + +What: /sys/kernel/debug/vfio//migration/state +Date: December 2023 +KernelVersion: 6.8 +Contact: Longfang Liu +Description: Read the live migration status of the vfio device. + The contents of the state file reflects the migration state + relative to those defined in the vfio_device_mig_state enum -- GitLab From 0d9bacfa615310e88bae385d64ca065953cb95ac Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 6 Nov 2023 15:22:25 +0800 Subject: [PATCH 0184/1290] MAINTAINERS: Add vfio debugfs interface doc link After adding the debugfs function to the vfio driver, a new debugfs-vfio file was added. Signed-off-by: Longfang Liu Link: https://lore.kernel.org/r/20231106072225.28577-4-liulongfang@huawei.com Signed-off-by: Alex Williamson --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 788be9ab5b733..d924ef5f56999 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22837,6 +22837,7 @@ M: Alex Williamson L: kvm@vger.kernel.org S: Maintained T: git https://github.com/awilliam/linux-vfio.git +F: Documentation/ABI/testing/debugfs-vfio F: Documentation/ABI/testing/sysfs-devices-vfio-dev F: Documentation/driver-api/vfio.rst F: drivers/vfio/ -- GitLab From 4004497cec3093d7b0087bc70709b45969fa07b6 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 16 Nov 2023 16:12:02 -0800 Subject: [PATCH 0185/1290] vfio/pds: Fix calculations in pds_vfio_dirty_sync The incorrect check is being done for comparing the iova/length being requested to sync. This can cause the dirty sync operation to fail. Fix this by making sure the iova offset added to the requested sync length doesn't exceed the region_size. Also, the region_start is assumed to always be at 0. This can cause dirty tracking to fail because the device/driver bitmap offset always starts at 0, however, the region_start/iova may not. Fix this by determining the iova offset from region_start to determine the bitmap offset. Fixes: f232836a9152 ("vfio/pds: Add support for dirty page tracking") Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20231117001207.2793-2-brett.creeley@amd.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/pds/dirty.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c index c937aa6f39546..27607d7b9030a 100644 --- a/drivers/vfio/pci/pds/dirty.c +++ b/drivers/vfio/pci/pds/dirty.c @@ -478,8 +478,7 @@ static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio, pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size, pages, bitmap_size); - if (!length || ((dirty->region_start + iova + length) > - (dirty->region_start + dirty->region_size))) { + if (!length || ((iova - dirty->region_start + length) > dirty->region_size)) { dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n", iova, length); return -EINVAL; @@ -496,7 +495,8 @@ static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio, return -EINVAL; } - bmp_offset = DIV_ROUND_UP(iova / dirty->region_page_size, sizeof(u64)); + bmp_offset = DIV_ROUND_UP((iova - dirty->region_start) / + dirty->region_page_size, sizeof(u64)); dev_dbg(dev, "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n", -- GitLab From 3b8f7a24d1fe79cc3fa1da9be5610b9c3aedbc2a Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 16 Nov 2023 16:12:03 -0800 Subject: [PATCH 0186/1290] vfio/pds: Only use a single SGL for both seq and ack Since the seq/ack operations never happen in parallel there is no need for multiple scatter gather lists per region. The current implementation is wasting memory. Fix this by only using a single scatter-gather list for both the seq and ack operations. Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20231117001207.2793-3-brett.creeley@amd.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/pds/dirty.c | 68 +++++++++++++----------------------- drivers/vfio/pci/pds/dirty.h | 6 ++-- 2 files changed, 28 insertions(+), 46 deletions(-) diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c index 27607d7b9030a..4462f6edb0ed4 100644 --- a/drivers/vfio/pci/pds/dirty.c +++ b/drivers/vfio/pci/pds/dirty.c @@ -100,35 +100,35 @@ static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty) } static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio, - struct pds_vfio_bmp_info *bmp_info) + struct pds_vfio_dirty *dirty) { struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; struct device *pdsc_dev = &pci_physfn(pdev)->dev; - dma_unmap_single(pdsc_dev, bmp_info->sgl_addr, - bmp_info->num_sge * sizeof(struct pds_lm_sg_elem), + dma_unmap_single(pdsc_dev, dirty->sgl_addr, + dirty->num_sge * sizeof(struct pds_lm_sg_elem), DMA_BIDIRECTIONAL); - kfree(bmp_info->sgl); + kfree(dirty->sgl); - bmp_info->num_sge = 0; - bmp_info->sgl = NULL; - bmp_info->sgl_addr = 0; + dirty->num_sge = 0; + dirty->sgl = NULL; + dirty->sgl_addr = 0; } static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio) { - if (pds_vfio->dirty.host_seq.sgl) - __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_seq); - if (pds_vfio->dirty.host_ack.sgl) - __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_ack); + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + + if (dirty->sgl) + __pds_vfio_dirty_free_sgl(pds_vfio, dirty); } -static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, - struct pds_vfio_bmp_info *bmp_info, - u32 page_count) +static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, + u32 page_count) { struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; struct device *pdsc_dev = &pci_physfn(pdev)->dev; + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; struct pds_lm_sg_elem *sgl; dma_addr_t sgl_addr; size_t sgl_size; @@ -147,30 +147,9 @@ static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, return -EIO; } - bmp_info->sgl = sgl; - bmp_info->num_sge = max_sge; - bmp_info->sgl_addr = sgl_addr; - - return 0; -} - -static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, - u32 page_count) -{ - struct pds_vfio_dirty *dirty = &pds_vfio->dirty; - int err; - - err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_seq, - page_count); - if (err) - return err; - - err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_ack, - page_count); - if (err) { - __pds_vfio_dirty_free_sgl(pds_vfio, &dirty->host_seq); - return err; - } + dirty->sgl = sgl; + dirty->num_sge = max_sge; + dirty->sgl_addr = sgl_addr; return 0; } @@ -328,6 +307,8 @@ static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE; struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; struct device *pdsc_dev = &pci_physfn(pdev)->dev; + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + struct pds_lm_sg_elem *sgl; unsigned long long npages; struct sg_table sg_table; struct scatterlist *sg; @@ -374,8 +355,9 @@ static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, if (err) goto out_free_sg_table; + sgl = pds_vfio->dirty.sgl; for_each_sgtable_dma_sg(&sg_table, sg, i) { - struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i]; + struct pds_lm_sg_elem *sg_elem = &sgl[i]; sg_elem->addr = cpu_to_le64(sg_dma_address(sg)); sg_elem->len = cpu_to_le32(sg_dma_len(sg)); @@ -383,15 +365,15 @@ static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, num_sge = sg_table.nents; size = num_sge * sizeof(struct pds_lm_sg_elem); - dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir); - err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge, + dma_sync_single_for_device(pdsc_dev, dirty->sgl_addr, size, dma_dir); + err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, dirty->sgl_addr, num_sge, offset, bmp_bytes, read_seq); if (err) dev_err(&pdev->dev, "Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n", bmp_type_str, offset, bmp_bytes, - num_sge, bmp_info->sgl_addr, ERR_PTR(err)); - dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir); + num_sge, dirty->sgl_addr, ERR_PTR(err)); + dma_sync_single_for_cpu(pdsc_dev, dirty->sgl_addr, size, dma_dir); dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0); out_free_sg_table: diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h index f78da25d75ca9..9de5aac58190e 100644 --- a/drivers/vfio/pci/pds/dirty.h +++ b/drivers/vfio/pci/pds/dirty.h @@ -7,9 +7,6 @@ struct pds_vfio_bmp_info { unsigned long *bmp; u32 bmp_bytes; - struct pds_lm_sg_elem *sgl; - dma_addr_t sgl_addr; - u16 num_sge; }; struct pds_vfio_dirty { @@ -18,6 +15,9 @@ struct pds_vfio_dirty { u64 region_size; u64 region_start; u64 region_page_size; + struct pds_lm_sg_elem *sgl; + dma_addr_t sgl_addr; + u16 num_sge; bool is_enabled; }; -- GitLab From 3f5898133a706402180908b671d5e781fc48f548 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 16 Nov 2023 16:12:04 -0800 Subject: [PATCH 0187/1290] vfio/pds: Move and rename region specific info An upcoming change in this series will add support for multiple regions. To prepare for that, move region specific information into struct pds_vfio_region and rename the members for readability. This will reduce the size of the patch that actually implements multiple region support. Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20231117001207.2793-4-brett.creeley@amd.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/pds/dirty.c | 83 ++++++++++++++++++------------------ drivers/vfio/pci/pds/dirty.h | 12 ++++-- 2 files changed, 50 insertions(+), 45 deletions(-) diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c index 4462f6edb0ed4..8a7f9eed4e598 100644 --- a/drivers/vfio/pci/pds/dirty.c +++ b/drivers/vfio/pci/pds/dirty.c @@ -85,18 +85,18 @@ static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty, return -ENOMEM; } - dirty->host_seq.bmp = host_seq_bmp; - dirty->host_ack.bmp = host_ack_bmp; + dirty->region.host_seq.bmp = host_seq_bmp; + dirty->region.host_ack.bmp = host_ack_bmp; return 0; } static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty) { - vfree(dirty->host_seq.bmp); - vfree(dirty->host_ack.bmp); - dirty->host_seq.bmp = NULL; - dirty->host_ack.bmp = NULL; + vfree(dirty->region.host_seq.bmp); + vfree(dirty->region.host_ack.bmp); + dirty->region.host_seq.bmp = NULL; + dirty->region.host_ack.bmp = NULL; } static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio, @@ -105,21 +105,21 @@ static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio, struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; struct device *pdsc_dev = &pci_physfn(pdev)->dev; - dma_unmap_single(pdsc_dev, dirty->sgl_addr, - dirty->num_sge * sizeof(struct pds_lm_sg_elem), + dma_unmap_single(pdsc_dev, dirty->region.sgl_addr, + dirty->region.num_sge * sizeof(struct pds_lm_sg_elem), DMA_BIDIRECTIONAL); - kfree(dirty->sgl); + kfree(dirty->region.sgl); - dirty->num_sge = 0; - dirty->sgl = NULL; - dirty->sgl_addr = 0; + dirty->region.num_sge = 0; + dirty->region.sgl = NULL; + dirty->region.sgl_addr = 0; } static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio) { struct pds_vfio_dirty *dirty = &pds_vfio->dirty; - if (dirty->sgl) + if (dirty->region.sgl) __pds_vfio_dirty_free_sgl(pds_vfio, dirty); } @@ -147,9 +147,9 @@ static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, return -EIO; } - dirty->sgl = sgl; - dirty->num_sge = max_sge; - dirty->sgl_addr = sgl_addr; + dirty->region.sgl = sgl; + dirty->region.num_sge = max_sge; + dirty->region.sgl_addr = sgl_addr; return 0; } @@ -267,9 +267,9 @@ static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, goto out_free_bitmaps; } - dirty->region_start = region_start; - dirty->region_size = region_size; - dirty->region_page_size = region_page_size; + dirty->region.start = region_start; + dirty->region.size = region_size; + dirty->region.page_size = region_page_size; pds_vfio_dirty_set_enabled(pds_vfio); pds_vfio_print_guest_region_info(pds_vfio, max_regions); @@ -304,11 +304,10 @@ static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, u32 offset, u32 bmp_bytes, bool read_seq) { const char *bmp_type_str = read_seq ? "read_seq" : "write_ack"; + struct pds_vfio_region *region = &pds_vfio->dirty.region; u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE; struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; struct device *pdsc_dev = &pci_physfn(pdev)->dev; - struct pds_vfio_dirty *dirty = &pds_vfio->dirty; - struct pds_lm_sg_elem *sgl; unsigned long long npages; struct sg_table sg_table; struct scatterlist *sg; @@ -355,9 +354,8 @@ static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, if (err) goto out_free_sg_table; - sgl = pds_vfio->dirty.sgl; for_each_sgtable_dma_sg(&sg_table, sg, i) { - struct pds_lm_sg_elem *sg_elem = &sgl[i]; + struct pds_lm_sg_elem *sg_elem = ®ion->sgl[i]; sg_elem->addr = cpu_to_le64(sg_dma_address(sg)); sg_elem->len = cpu_to_le32(sg_dma_len(sg)); @@ -365,15 +363,15 @@ static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, num_sge = sg_table.nents; size = num_sge * sizeof(struct pds_lm_sg_elem); - dma_sync_single_for_device(pdsc_dev, dirty->sgl_addr, size, dma_dir); - err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, dirty->sgl_addr, num_sge, + dma_sync_single_for_device(pdsc_dev, region->sgl_addr, size, dma_dir); + err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, region->sgl_addr, num_sge, offset, bmp_bytes, read_seq); if (err) dev_err(&pdev->dev, "Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n", bmp_type_str, offset, bmp_bytes, - num_sge, dirty->sgl_addr, ERR_PTR(err)); - dma_sync_single_for_cpu(pdsc_dev, dirty->sgl_addr, size, dma_dir); + num_sge, region->sgl_addr, ERR_PTR(err)); + dma_sync_single_for_cpu(pdsc_dev, region->sgl_addr, size, dma_dir); dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0); out_free_sg_table: @@ -387,14 +385,18 @@ out_free_pages: static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio, u32 offset, u32 len) { - return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack, + struct pds_vfio_region *region = &pds_vfio->dirty.region; + + return pds_vfio_dirty_seq_ack(pds_vfio, ®ion->host_ack, offset, len, WRITE_ACK); } static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio, u32 offset, u32 len) { - return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq, + struct pds_vfio_region *region = &pds_vfio->dirty.region; + + return pds_vfio_dirty_seq_ack(pds_vfio, ®ion->host_seq, offset, len, READ_SEQ); } @@ -402,15 +404,15 @@ static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio, struct iova_bitmap *dirty_bitmap, u32 bmp_offset, u32 len_bytes) { - u64 page_size = pds_vfio->dirty.region_page_size; - u64 region_start = pds_vfio->dirty.region_start; + u64 page_size = pds_vfio->dirty.region.page_size; + u64 region_start = pds_vfio->dirty.region.start; u32 bmp_offset_bit; __le64 *seq, *ack; int dword_count; dword_count = len_bytes / sizeof(u64); - seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset); - ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset); + seq = (__le64 *)((u64)pds_vfio->dirty.region.host_seq.bmp + bmp_offset); + ack = (__le64 *)((u64)pds_vfio->dirty.region.host_ack.bmp + bmp_offset); bmp_offset_bit = bmp_offset * 8; for (int i = 0; i < dword_count; i++) { @@ -451,25 +453,24 @@ static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio, return -EINVAL; } - pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size); + pages = DIV_ROUND_UP(length, pds_vfio->dirty.region.page_size); bitmap_size = round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE; dev_dbg(dev, "vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n", - pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size, + pds_vfio->vf_id, iova, length, pds_vfio->dirty.region.page_size, pages, bitmap_size); - if (!length || ((iova - dirty->region_start + length) > dirty->region_size)) { + if (!length || ((iova - dirty->region.start + length) > dirty->region.size)) { dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n", iova, length); return -EINVAL; } /* bitmap is modified in 64 bit chunks */ - bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size, - sizeof(u64)), - sizeof(u64)); + bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region.page_size, + sizeof(u64)), sizeof(u64)); if (bmp_bytes != bitmap_size) { dev_err(dev, "Calculated bitmap bytes %llu not equal to bitmap size %llu\n", @@ -477,8 +478,8 @@ static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio, return -EINVAL; } - bmp_offset = DIV_ROUND_UP((iova - dirty->region_start) / - dirty->region_page_size, sizeof(u64)); + bmp_offset = DIV_ROUND_UP((iova - dirty->region.start) / + dirty->region.page_size, sizeof(u64)); dev_dbg(dev, "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n", diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h index 9de5aac58190e..07662d369e7c6 100644 --- a/drivers/vfio/pci/pds/dirty.h +++ b/drivers/vfio/pci/pds/dirty.h @@ -9,15 +9,19 @@ struct pds_vfio_bmp_info { u32 bmp_bytes; }; -struct pds_vfio_dirty { +struct pds_vfio_region { struct pds_vfio_bmp_info host_seq; struct pds_vfio_bmp_info host_ack; - u64 region_size; - u64 region_start; - u64 region_page_size; + u64 size; + u64 start; + u64 page_size; struct pds_lm_sg_elem *sgl; dma_addr_t sgl_addr; u16 num_sge; +}; + +struct pds_vfio_dirty { + struct pds_vfio_region region; bool is_enabled; }; -- GitLab From 87bdf9807ed70f5d3a0f852f9d04ceb1c99ad9f6 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 16 Nov 2023 16:12:05 -0800 Subject: [PATCH 0188/1290] vfio/pds: Pass region info to relevant functions A later patch in the series implements multi-region support. That will require specific regions to be passed to relevant functions. Prepare for that change by passing the region structure to relevant functions. Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20231117001207.2793-5-brett.creeley@amd.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/pds/dirty.c | 71 ++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c index 8a7f9eed4e598..09fed7c1771a8 100644 --- a/drivers/vfio/pci/pds/dirty.c +++ b/drivers/vfio/pci/pds/dirty.c @@ -100,35 +100,35 @@ static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty) } static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio, - struct pds_vfio_dirty *dirty) + struct pds_vfio_region *region) { struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; struct device *pdsc_dev = &pci_physfn(pdev)->dev; - dma_unmap_single(pdsc_dev, dirty->region.sgl_addr, - dirty->region.num_sge * sizeof(struct pds_lm_sg_elem), + dma_unmap_single(pdsc_dev, region->sgl_addr, + region->num_sge * sizeof(struct pds_lm_sg_elem), DMA_BIDIRECTIONAL); - kfree(dirty->region.sgl); + kfree(region->sgl); - dirty->region.num_sge = 0; - dirty->region.sgl = NULL; - dirty->region.sgl_addr = 0; + region->num_sge = 0; + region->sgl = NULL; + region->sgl_addr = 0; } static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio) { - struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + struct pds_vfio_region *region = &pds_vfio->dirty.region; - if (dirty->region.sgl) - __pds_vfio_dirty_free_sgl(pds_vfio, dirty); + if (region->sgl) + __pds_vfio_dirty_free_sgl(pds_vfio, region); } static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_region *region, u32 page_count) { struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; struct device *pdsc_dev = &pci_physfn(pdev)->dev; - struct pds_vfio_dirty *dirty = &pds_vfio->dirty; struct pds_lm_sg_elem *sgl; dma_addr_t sgl_addr; size_t sgl_size; @@ -147,9 +147,9 @@ static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, return -EIO; } - dirty->region.sgl = sgl; - dirty->region.num_sge = max_sge; - dirty->region.sgl_addr = sgl_addr; + region->sgl = sgl; + region->num_sge = max_sge; + region->sgl_addr = sgl_addr; return 0; } @@ -260,7 +260,7 @@ static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, goto out_free_region_info; } - err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count); + err = pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->region, page_count); if (err) { dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n", ERR_PTR(err)); @@ -300,11 +300,11 @@ void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd) } static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_region *region, struct pds_vfio_bmp_info *bmp_info, u32 offset, u32 bmp_bytes, bool read_seq) { const char *bmp_type_str = read_seq ? "read_seq" : "write_ack"; - struct pds_vfio_region *region = &pds_vfio->dirty.region; u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE; struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; struct device *pdsc_dev = &pci_physfn(pdev)->dev; @@ -383,36 +383,36 @@ out_free_pages: } static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_region *region, u32 offset, u32 len) { - struct pds_vfio_region *region = &pds_vfio->dirty.region; - return pds_vfio_dirty_seq_ack(pds_vfio, ®ion->host_ack, + return pds_vfio_dirty_seq_ack(pds_vfio, region, ®ion->host_ack, offset, len, WRITE_ACK); } static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_region *region, u32 offset, u32 len) { - struct pds_vfio_region *region = &pds_vfio->dirty.region; - - return pds_vfio_dirty_seq_ack(pds_vfio, ®ion->host_seq, + return pds_vfio_dirty_seq_ack(pds_vfio, region, ®ion->host_seq, offset, len, READ_SEQ); } static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_region *region, struct iova_bitmap *dirty_bitmap, u32 bmp_offset, u32 len_bytes) { - u64 page_size = pds_vfio->dirty.region.page_size; - u64 region_start = pds_vfio->dirty.region.start; + u64 page_size = region->page_size; + u64 region_start = region->start; u32 bmp_offset_bit; __le64 *seq, *ack; int dword_count; dword_count = len_bytes / sizeof(u64); - seq = (__le64 *)((u64)pds_vfio->dirty.region.host_seq.bmp + bmp_offset); - ack = (__le64 *)((u64)pds_vfio->dirty.region.host_ack.bmp + bmp_offset); + seq = (__le64 *)((u64)region->host_seq.bmp + bmp_offset); + ack = (__le64 *)((u64)region->host_ack.bmp + bmp_offset); bmp_offset_bit = bmp_offset * 8; for (int i = 0; i < dword_count; i++) { @@ -441,6 +441,7 @@ static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio, { struct device *dev = &pds_vfio->vfio_coredev.pdev->dev; struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + struct pds_vfio_region *region = &dirty->region; u64 bmp_offset, bmp_bytes; u64 bitmap_size, pages; int err; @@ -453,23 +454,23 @@ static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio, return -EINVAL; } - pages = DIV_ROUND_UP(length, pds_vfio->dirty.region.page_size); + pages = DIV_ROUND_UP(length, region->page_size); bitmap_size = round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE; dev_dbg(dev, "vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n", - pds_vfio->vf_id, iova, length, pds_vfio->dirty.region.page_size, + pds_vfio->vf_id, iova, length, region->page_size, pages, bitmap_size); - if (!length || ((iova - dirty->region.start + length) > dirty->region.size)) { + if (!length || ((iova - region->start + length) > region->size)) { dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n", iova, length); return -EINVAL; } /* bitmap is modified in 64 bit chunks */ - bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region.page_size, + bmp_bytes = ALIGN(DIV_ROUND_UP(length / region->page_size, sizeof(u64)), sizeof(u64)); if (bmp_bytes != bitmap_size) { dev_err(dev, @@ -478,23 +479,23 @@ static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio, return -EINVAL; } - bmp_offset = DIV_ROUND_UP((iova - dirty->region.start) / - dirty->region.page_size, sizeof(u64)); + bmp_offset = DIV_ROUND_UP((iova - region->start) / + region->page_size, sizeof(u64)); dev_dbg(dev, "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n", iova, length, bmp_offset, bmp_bytes); - err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes); + err = pds_vfio_dirty_read_seq(pds_vfio, region, bmp_offset, bmp_bytes); if (err) return err; - err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset, - bmp_bytes); + err = pds_vfio_dirty_process_bitmaps(pds_vfio, region, dirty_bitmap, + bmp_offset, bmp_bytes); if (err) return err; - err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes); + err = pds_vfio_dirty_write_ack(pds_vfio, region, bmp_offset, bmp_bytes); if (err) return err; -- GitLab From 0c320f223ee6892a413428051f35bebf85fe83d3 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 16 Nov 2023 16:12:06 -0800 Subject: [PATCH 0189/1290] vfio/pds: Move seq/ack bitmaps into region struct Since the host seq/ack bitmaps are part of a region move them into struct pds_vfio_region. Also, make use of the bmp_bytes value for validation purposes. Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20231117001207.2793-6-brett.creeley@amd.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/pds/dirty.c | 35 ++++++++++++++++++++++------------- drivers/vfio/pci/pds/dirty.h | 10 +++------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c index 09fed7c1771a8..4824cdfe01eda 100644 --- a/drivers/vfio/pci/pds/dirty.c +++ b/drivers/vfio/pci/pds/dirty.c @@ -85,18 +85,20 @@ static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty, return -ENOMEM; } - dirty->region.host_seq.bmp = host_seq_bmp; - dirty->region.host_ack.bmp = host_ack_bmp; + dirty->region.host_seq = host_seq_bmp; + dirty->region.host_ack = host_ack_bmp; + dirty->region.bmp_bytes = bytes; return 0; } static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty) { - vfree(dirty->region.host_seq.bmp); - vfree(dirty->region.host_ack.bmp); - dirty->region.host_seq.bmp = NULL; - dirty->region.host_ack.bmp = NULL; + vfree(dirty->region.host_seq); + vfree(dirty->region.host_ack); + dirty->region.host_seq = NULL; + dirty->region.host_ack = NULL; + dirty->region.bmp_bytes = 0; } static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio, @@ -301,8 +303,8 @@ void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd) static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, struct pds_vfio_region *region, - struct pds_vfio_bmp_info *bmp_info, - u32 offset, u32 bmp_bytes, bool read_seq) + unsigned long *seq_ack_bmp, u32 offset, + u32 bmp_bytes, bool read_seq) { const char *bmp_type_str = read_seq ? "read_seq" : "write_ack"; u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE; @@ -319,7 +321,7 @@ static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, int err; int i; - bmp = (void *)((u64)bmp_info->bmp + offset); + bmp = (void *)((u64)seq_ack_bmp + offset); page_offset = offset_in_page(bmp); bmp -= page_offset; @@ -387,7 +389,7 @@ static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio, u32 offset, u32 len) { - return pds_vfio_dirty_seq_ack(pds_vfio, region, ®ion->host_ack, + return pds_vfio_dirty_seq_ack(pds_vfio, region, region->host_ack, offset, len, WRITE_ACK); } @@ -395,7 +397,7 @@ static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio, struct pds_vfio_region *region, u32 offset, u32 len) { - return pds_vfio_dirty_seq_ack(pds_vfio, region, ®ion->host_seq, + return pds_vfio_dirty_seq_ack(pds_vfio, region, region->host_seq, offset, len, READ_SEQ); } @@ -411,8 +413,8 @@ static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio, int dword_count; dword_count = len_bytes / sizeof(u64); - seq = (__le64 *)((u64)region->host_seq.bmp + bmp_offset); - ack = (__le64 *)((u64)region->host_ack.bmp + bmp_offset); + seq = (__le64 *)((u64)region->host_seq + bmp_offset); + ack = (__le64 *)((u64)region->host_ack + bmp_offset); bmp_offset_bit = bmp_offset * 8; for (int i = 0; i < dword_count; i++) { @@ -479,6 +481,13 @@ static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio, return -EINVAL; } + if (bmp_bytes > region->bmp_bytes) { + dev_err(dev, + "Calculated bitmap bytes %llu larger than region's cached bmp_bytes %llu\n", + bmp_bytes, region->bmp_bytes); + return -EINVAL; + } + bmp_offset = DIV_ROUND_UP((iova - region->start) / region->page_size, sizeof(u64)); diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h index 07662d369e7c6..a1f6d894f913a 100644 --- a/drivers/vfio/pci/pds/dirty.h +++ b/drivers/vfio/pci/pds/dirty.h @@ -4,14 +4,10 @@ #ifndef _DIRTY_H_ #define _DIRTY_H_ -struct pds_vfio_bmp_info { - unsigned long *bmp; - u32 bmp_bytes; -}; - struct pds_vfio_region { - struct pds_vfio_bmp_info host_seq; - struct pds_vfio_bmp_info host_ack; + unsigned long *host_seq; + unsigned long *host_ack; + u64 bmp_bytes; u64 size; u64 start; u64 page_size; -- GitLab From 2e7c6feb4ef5257189819359e754625ccf5a1d6c Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 16 Nov 2023 16:12:07 -0800 Subject: [PATCH 0190/1290] vfio/pds: Add multi-region support Only supporting a single region/range is limiting, wasteful, and in some cases broken (i.e. when there are large gaps in the iova memory ranges). Fix this by adding support for multiple regions based on what the device tells the driver it can support. Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20231117001207.2793-7-brett.creeley@amd.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/pds/dirty.c | 220 ++++++++++++++++++++++++----------- drivers/vfio/pci/pds/dirty.h | 4 +- 2 files changed, 156 insertions(+), 68 deletions(-) diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c index 4824cdfe01eda..8ddf4346fcd5d 100644 --- a/drivers/vfio/pci/pds/dirty.c +++ b/drivers/vfio/pci/pds/dirty.c @@ -70,7 +70,7 @@ out_free_region_info: kfree(region_info); } -static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty, +static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_region *region, unsigned long bytes) { unsigned long *host_seq_bmp, *host_ack_bmp; @@ -85,20 +85,27 @@ static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty, return -ENOMEM; } - dirty->region.host_seq = host_seq_bmp; - dirty->region.host_ack = host_ack_bmp; - dirty->region.bmp_bytes = bytes; + region->host_seq = host_seq_bmp; + region->host_ack = host_ack_bmp; + region->bmp_bytes = bytes; return 0; } static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty) { - vfree(dirty->region.host_seq); - vfree(dirty->region.host_ack); - dirty->region.host_seq = NULL; - dirty->region.host_ack = NULL; - dirty->region.bmp_bytes = 0; + if (!dirty->regions) + return; + + for (int i = 0; i < dirty->num_regions; i++) { + struct pds_vfio_region *region = &dirty->regions[i]; + + vfree(region->host_seq); + vfree(region->host_ack); + region->host_seq = NULL; + region->host_ack = NULL; + region->bmp_bytes = 0; + } } static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio, @@ -119,10 +126,17 @@ static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio, static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio) { - struct pds_vfio_region *region = &pds_vfio->dirty.region; + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; - if (region->sgl) - __pds_vfio_dirty_free_sgl(pds_vfio, region); + if (!dirty->regions) + return; + + for (int i = 0; i < dirty->num_regions; i++) { + struct pds_vfio_region *region = &dirty->regions[i]; + + if (region->sgl) + __pds_vfio_dirty_free_sgl(pds_vfio, region); + } } static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, @@ -156,22 +170,90 @@ static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, return 0; } +static void pds_vfio_dirty_free_regions(struct pds_vfio_dirty *dirty) +{ + vfree(dirty->regions); + dirty->regions = NULL; + dirty->num_regions = 0; +} + +static int pds_vfio_dirty_alloc_regions(struct pds_vfio_pci_device *pds_vfio, + struct pds_lm_dirty_region_info *region_info, + u64 region_page_size, u8 num_regions) +{ + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + u32 dev_bmp_offset_byte = 0; + int err; + + dirty->regions = vcalloc(num_regions, sizeof(struct pds_vfio_region)); + if (!dirty->regions) + return -ENOMEM; + dirty->num_regions = num_regions; + + for (int i = 0; i < num_regions; i++) { + struct pds_lm_dirty_region_info *ri = ®ion_info[i]; + struct pds_vfio_region *region = &dirty->regions[i]; + u64 region_size, region_start; + u32 page_count; + + /* page_count might be adjusted by the device */ + page_count = le32_to_cpu(ri->page_count); + region_start = le64_to_cpu(ri->dma_base); + region_size = page_count * region_page_size; + + err = pds_vfio_dirty_alloc_bitmaps(region, + page_count / BITS_PER_BYTE); + if (err) { + dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n", + ERR_PTR(err)); + goto out_free_regions; + } + + err = pds_vfio_dirty_alloc_sgl(pds_vfio, region, page_count); + if (err) { + dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n", + ERR_PTR(err)); + goto out_free_regions; + } + + region->size = region_size; + region->start = region_start; + region->page_size = region_page_size; + region->dev_bmp_offset_start_byte = dev_bmp_offset_byte; + + dev_bmp_offset_byte += page_count / BITS_PER_BYTE; + if (dev_bmp_offset_byte % BITS_PER_BYTE) { + dev_err(&pdev->dev, "Device bitmap offset is mis-aligned\n"); + err = -EINVAL; + goto out_free_regions; + } + } + + return 0; + +out_free_regions: + pds_vfio_dirty_free_bitmaps(dirty); + pds_vfio_dirty_free_sgl(pds_vfio); + pds_vfio_dirty_free_regions(dirty); + + return err; +} + static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, struct rb_root_cached *ranges, u32 nnodes, u64 *page_size) { struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; struct device *pdsc_dev = &pci_physfn(pdev)->dev; - struct pds_vfio_dirty *dirty = &pds_vfio->dirty; - u64 region_start, region_size, region_page_size; struct pds_lm_dirty_region_info *region_info; struct interval_tree_node *node = NULL; + u64 region_page_size = *page_size; u8 max_regions = 0, num_regions; dma_addr_t regions_dma = 0; u32 num_ranges = nnodes; - u32 page_count; - u16 len; int err; + u16 len; dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n", pds_vfio->vf_id); @@ -198,39 +280,38 @@ static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, return -EOPNOTSUPP; } - /* - * Only support 1 region for now. If there are any large gaps in the - * VM's address regions, then this would be a waste of memory as we are - * generating 2 bitmaps (ack/seq) from the min address to the max - * address of the VM's address regions. In the future, if we support - * more than one region in the device/driver we can split the bitmaps - * on the largest address region gaps. We can do this split up to the - * max_regions times returned from the dirty_status command. - */ - max_regions = 1; if (num_ranges > max_regions) { vfio_combine_iova_ranges(ranges, nnodes, max_regions); num_ranges = max_regions; } + region_info = kcalloc(num_ranges, sizeof(*region_info), GFP_KERNEL); + if (!region_info) + return -ENOMEM; + len = num_ranges * sizeof(*region_info); + node = interval_tree_iter_first(ranges, 0, ULONG_MAX); if (!node) return -EINVAL; + for (int i = 0; i < num_ranges; i++) { + struct pds_lm_dirty_region_info *ri = ®ion_info[i]; + u64 region_size = node->last - node->start + 1; + u64 region_start = node->start; + u32 page_count; - region_size = node->last - node->start + 1; - region_start = node->start; - region_page_size = *page_size; + page_count = DIV_ROUND_UP(region_size, region_page_size); - len = sizeof(*region_info); - region_info = kzalloc(len, GFP_KERNEL); - if (!region_info) - return -ENOMEM; + ri->dma_base = cpu_to_le64(region_start); + ri->page_count = cpu_to_le32(page_count); + ri->page_size_log2 = ilog2(region_page_size); - page_count = DIV_ROUND_UP(region_size, region_page_size); + dev_dbg(&pdev->dev, + "region_info[%d]: region_start 0x%llx region_end 0x%lx region_size 0x%llx page_count %u page_size %llu\n", + i, region_start, node->last, region_size, page_count, + region_page_size); - region_info->dma_base = cpu_to_le64(region_start); - region_info->page_count = cpu_to_le32(page_count); - region_info->page_size_log2 = ilog2(region_page_size); + node = interval_tree_iter_next(node, 0, ULONG_MAX); + } regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len, DMA_BIDIRECTIONAL); @@ -239,39 +320,20 @@ static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, goto out_free_region_info; } - err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions); + err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, num_ranges); dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL); if (err) goto out_free_region_info; - /* - * page_count might be adjusted by the device, - * update it before freeing region_info DMA - */ - page_count = le32_to_cpu(region_info->page_count); - - dev_dbg(&pdev->dev, - "region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n", - regions_dma, region_start, page_count, - (u8)ilog2(region_page_size)); - - err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE); - if (err) { - dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n", - ERR_PTR(err)); - goto out_free_region_info; - } - - err = pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->region, page_count); + err = pds_vfio_dirty_alloc_regions(pds_vfio, region_info, + region_page_size, num_ranges); if (err) { - dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n", - ERR_PTR(err)); - goto out_free_bitmaps; + dev_err(&pdev->dev, + "Failed to allocate %d regions for tracking dirty regions: %pe\n", + num_regions, ERR_PTR(err)); + goto out_dirty_disable; } - dirty->region.start = region_start; - dirty->region.size = region_size; - dirty->region.page_size = region_page_size; pds_vfio_dirty_set_enabled(pds_vfio); pds_vfio_print_guest_region_info(pds_vfio, max_regions); @@ -280,8 +342,8 @@ static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, return 0; -out_free_bitmaps: - pds_vfio_dirty_free_bitmaps(dirty); +out_dirty_disable: + pds_vfio_dirty_disable_cmd(pds_vfio); out_free_region_info: kfree(region_info); return err; @@ -295,6 +357,7 @@ void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd) pds_vfio_dirty_disable_cmd(pds_vfio); pds_vfio_dirty_free_sgl(pds_vfio); pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty); + pds_vfio_dirty_free_regions(&pds_vfio->dirty); } if (send_cmd) @@ -365,6 +428,7 @@ static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, num_sge = sg_table.nents; size = num_sge * sizeof(struct pds_lm_sg_elem); + offset += region->dev_bmp_offset_start_byte; dma_sync_single_for_device(pdsc_dev, region->sgl_addr, size, dma_dir); err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, region->sgl_addr, num_sge, offset, bmp_bytes, read_seq); @@ -437,13 +501,28 @@ static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio, return 0; } +static struct pds_vfio_region * +pds_vfio_get_region(struct pds_vfio_pci_device *pds_vfio, unsigned long iova) +{ + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + + for (int i = 0; i < dirty->num_regions; i++) { + struct pds_vfio_region *region = &dirty->regions[i]; + + if (iova >= region->start && + iova < (region->start + region->size)) + return region; + } + + return NULL; +} + static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio, struct iova_bitmap *dirty_bitmap, unsigned long iova, unsigned long length) { struct device *dev = &pds_vfio->vfio_coredev.pdev->dev; - struct pds_vfio_dirty *dirty = &pds_vfio->dirty; - struct pds_vfio_region *region = &dirty->region; + struct pds_vfio_region *region; u64 bmp_offset, bmp_bytes; u64 bitmap_size, pages; int err; @@ -456,6 +535,13 @@ static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio, return -EINVAL; } + region = pds_vfio_get_region(pds_vfio, iova); + if (!region) { + dev_err(dev, "vf%u: Failed to find region that contains iova 0x%lx length 0x%lx\n", + pds_vfio->vf_id, iova, length); + return -EINVAL; + } + pages = DIV_ROUND_UP(length, region->page_size); bitmap_size = round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE; diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h index a1f6d894f913a..c8e23018b8018 100644 --- a/drivers/vfio/pci/pds/dirty.h +++ b/drivers/vfio/pci/pds/dirty.h @@ -13,11 +13,13 @@ struct pds_vfio_region { u64 page_size; struct pds_lm_sg_elem *sgl; dma_addr_t sgl_addr; + u32 dev_bmp_offset_start_byte; u16 num_sge; }; struct pds_vfio_dirty { - struct pds_vfio_region region; + struct pds_vfio_region *regions; + u8 num_regions; bool is_enabled; }; -- GitLab From 160912fc3d4ae492954c25cd78b90083869f0011 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 30 Nov 2023 20:09:00 +0000 Subject: [PATCH 0191/1290] vfio/type1: account iommu allocations iommu allocations should be accounted in order to allow admins to monitor and limit the amount of iommu memory. Signed-off-by: Pasha Tatashin Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231130200900.2320829-1-pasha.tatashin@soleen.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index eacd6ec04de5a..b2854d7939ce0 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1436,7 +1436,7 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova, list_for_each_entry(d, &iommu->domain_list, next) { ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT, npage << PAGE_SHIFT, prot | IOMMU_CACHE, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (ret) goto unwind; @@ -1750,7 +1750,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, } ret = iommu_map(domain->domain, iova, phys, size, - dma->prot | IOMMU_CACHE, GFP_KERNEL); + dma->prot | IOMMU_CACHE, + GFP_KERNEL_ACCOUNT); if (ret) { if (!dma->iommu_mapped) { vfio_unpin_pages_remote(dma, iova, @@ -1845,7 +1846,8 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head * continue; ret = iommu_map(domain->domain, start, page_to_phys(pages), PAGE_SIZE * 2, - IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE, GFP_KERNEL); + IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE, + GFP_KERNEL_ACCOUNT); if (!ret) { size_t unmapped = iommu_unmap(domain->domain, start, PAGE_SIZE); -- GitLab From 10a149e4b4a9187940adbfff0f216ccb5a15aa41 Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Thu, 30 Nov 2023 18:15:49 -0800 Subject: [PATCH 0192/1290] perf vendor events arm64 AmpereOne: Rename BPU_FLUSH_MEM_FAULT to GPC_FLUSH_MEM_FAULT The documentation wrongly called the event as BPU_FLUSH_MEM_FAULT and now has been fixed. Correct the name in the perf tool as well. Fixes: a9650b7f6fc09d16 ("perf vendor events arm64: Add AmpereOne core PMU events") Reviewed-by: Ian Rogers Signed-off-by: Ilkka Koskinen Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ilkka Koskinen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: linux-arm-kernel@lists.infradead.org Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Link: https://lore.kernel.org/r/20231201021550.1109196-3-ilkka@os.amperecomputing.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/arm64/ampere/ampereone/core-imp-def.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/core-imp-def.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/core-imp-def.json index 88b23b85e33cd..879ff21e0b177 100644 --- a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/core-imp-def.json +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/core-imp-def.json @@ -110,7 +110,7 @@ { "PublicDescription": "Flushes due to memory hazards", "EventCode": "0x121", - "EventName": "BPU_FLUSH_MEM_FAULT", + "EventName": "GPC_FLUSH_MEM_FAULT", "BriefDescription": "Flushes due to memory hazards" }, { -- GitLab From 16438b652b464ef7d0a877d31e93ab54338f6b0a Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Thu, 30 Nov 2023 18:15:50 -0800 Subject: [PATCH 0193/1290] perf vendor events arm64 AmpereOneX: Add core PMU events and metrics Add JSON files for AmpereOneX core PMU events and metrics. Reviewed-by: Ian Rogers Signed-off-by: Ilkka Koskinen Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20231201021550.1109196-4-ilkka@os.amperecomputing.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/arm64/ampere/ampereonex/branch.json | 125 +++++ .../arch/arm64/ampere/ampereonex/bus.json | 20 + .../arch/arm64/ampere/ampereonex/cache.json | 206 ++++++++ .../arm64/ampere/ampereonex/core-imp-def.json | 464 ++++++++++++++++++ .../arm64/ampere/ampereonex/exception.json | 47 ++ .../arm64/ampere/ampereonex/instruction.json | 128 +++++ .../arm64/ampere/ampereonex/intrinsic.json | 14 + .../arch/arm64/ampere/ampereonex/memory.json | 41 ++ .../arch/arm64/ampere/ampereonex/metrics.json | 442 +++++++++++++++++ .../arch/arm64/ampere/ampereonex/mmu.json | 170 +++++++ .../arm64/ampere/ampereonex/pipeline.json | 41 ++ .../arch/arm64/ampere/ampereonex/spe.json | 14 + tools/perf/pmu-events/arch/arm64/mapfile.csv | 1 + 13 files changed, 1713 insertions(+) create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereonex/branch.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereonex/bus.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereonex/core-imp-def.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereonex/exception.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereonex/instruction.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereonex/intrinsic.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereonex/memory.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereonex/metrics.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereonex/mmu.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereonex/pipeline.json create mode 100644 tools/perf/pmu-events/arch/arm64/ampere/ampereonex/spe.json diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/branch.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/branch.json new file mode 100644 index 0000000000000..a632755fc0869 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/branch.json @@ -0,0 +1,125 @@ +[ + { + "ArchStdEvent": "BR_IMMED_SPEC" + }, + { + "ArchStdEvent": "BR_RETURN_SPEC" + }, + { + "ArchStdEvent": "BR_INDIRECT_SPEC" + }, + { + "ArchStdEvent": "BR_MIS_PRED" + }, + { + "ArchStdEvent": "BR_PRED" + }, + { + "PublicDescription": "Instruction architecturally executed, branch not taken", + "EventCode": "0x8107", + "EventName": "BR_SKIP_RETIRED", + "BriefDescription": "Instruction architecturally executed, branch not taken" + }, + { + "PublicDescription": "Instruction architecturally executed, immediate branch taken", + "EventCode": "0x8108", + "EventName": "BR_IMMED_TAKEN_RETIRED", + "BriefDescription": "Instruction architecturally executed, immediate branch taken" + }, + { + "PublicDescription": "Instruction architecturally executed, indirect branch excluding return retired", + "EventCode": "0x810c", + "EventName": "BR_INDNR_TAKEN_RETIRED", + "BriefDescription": "Instruction architecturally executed, indirect branch excluding return retired" + }, + { + "PublicDescription": "Instruction architecturally executed, predicted immediate branch", + "EventCode": "0x8110", + "EventName": "BR_IMMED_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, predicted immediate branch" + }, + { + "PublicDescription": "Instruction architecturally executed, mispredicted immediate branch", + "EventCode": "0x8111", + "EventName": "BR_IMMED_MIS_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, mispredicted immediate branch" + }, + { + "PublicDescription": "Instruction architecturally executed, predicted indirect branch", + "EventCode": "0x8112", + "EventName": "BR_IND_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, predicted indirect branch" + }, + { + "PublicDescription": "Instruction architecturally executed, mispredicted indirect branch", + "EventCode": "0x8113", + "EventName": "BR_IND_MIS_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, mispredicted indirect branch" + }, + { + "PublicDescription": "Instruction architecturally executed, predicted procedure return", + "EventCode": "0x8114", + "EventName": "BR_RETURN_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, predicted procedure return" + }, + { + "PublicDescription": "Instruction architecturally executed, mispredicted procedure return", + "EventCode": "0x8115", + "EventName": "BR_RETURN_MIS_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, mispredicted procedure return" + }, + { + "PublicDescription": "Instruction architecturally executed, predicted indirect branch excluding return", + "EventCode": "0x8116", + "EventName": "BR_INDNR_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, predicted indirect branch excluding return" + }, + { + "PublicDescription": "Instruction architecturally executed, mispredicted indirect branch excluding return", + "EventCode": "0x8117", + "EventName": "BR_INDNR_MIS_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, mispredicted indirect branch excluding return" + }, + { + "PublicDescription": "Instruction architecturally executed, predicted branch, taken", + "EventCode": "0x8118", + "EventName": "BR_TAKEN_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, predicted branch, taken" + }, + { + "PublicDescription": "Instruction architecturally executed, mispredicted branch, taken", + "EventCode": "0x8119", + "EventName": "BR_TAKEN_MIS_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, mispredicted branch, taken" + }, + { + "PublicDescription": "Instruction architecturally executed, predicted branch, not taken", + "EventCode": "0x811a", + "EventName": "BR_SKIP_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, predicted branch, not taken" + }, + { + "PublicDescription": "Instruction architecturally executed, mispredicted branch, not taken", + "EventCode": "0x811b", + "EventName": "BR_SKIP_MIS_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, mispredicted branch, not taken" + }, + { + "PublicDescription": "Instruction architecturally executed, predicted branch", + "EventCode": "0x811c", + "EventName": "BR_PRED_RETIRED", + "BriefDescription": "Instruction architecturally executed, predicted branch" + }, + { + "PublicDescription": "Instruction architecturally executed, indirect branch", + "EventCode": "0x811d", + "EventName": "BR_IND_RETIRED", + "BriefDescription": "Instruction architecturally executed, indirect branch" + }, + { + "PublicDescription": "Branch Record captured.", + "EventCode": "0x811f", + "EventName": "BRB_FILTRATE", + "BriefDescription": "Branch Record captured." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/bus.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/bus.json new file mode 100644 index 0000000000000..2aeb9907831d6 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/bus.json @@ -0,0 +1,20 @@ +[ + { + "ArchStdEvent": "CPU_CYCLES" + }, + { + "ArchStdEvent": "BUS_CYCLES" + }, + { + "ArchStdEvent": "BUS_ACCESS_RD" + }, + { + "ArchStdEvent": "BUS_ACCESS_WR" + }, + { + "ArchStdEvent": "BUS_ACCESS" + }, + { + "ArchStdEvent": "CNT_CYCLES" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json new file mode 100644 index 0000000000000..c50d8e930b05e --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/cache.json @@ -0,0 +1,206 @@ +[ + { + "ArchStdEvent": "L1D_CACHE_RD" + }, + { + "ArchStdEvent": "L1D_CACHE_WR" + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_RD" + }, + { + "ArchStdEvent": "L1D_CACHE_INVAL" + }, + { + "ArchStdEvent": "L1D_TLB_REFILL_RD" + }, + { + "ArchStdEvent": "L1D_TLB_REFILL_WR" + }, + { + "ArchStdEvent": "L2D_CACHE_RD" + }, + { + "ArchStdEvent": "L2D_CACHE_WR" + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_RD" + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL_WR" + }, + { + "ArchStdEvent": "L2D_CACHE_WB_VICTIM" + }, + { + "ArchStdEvent": "L2D_CACHE_WB_CLEAN" + }, + { + "ArchStdEvent": "L2D_CACHE_INVAL" + }, + { + "ArchStdEvent": "L1I_CACHE_REFILL" + }, + { + "ArchStdEvent": "L1I_TLB_REFILL" + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL" + }, + { + "ArchStdEvent": "L1D_CACHE" + }, + { + "ArchStdEvent": "L1D_TLB_REFILL" + }, + { + "ArchStdEvent": "L1I_CACHE" + }, + { + "ArchStdEvent": "L2D_CACHE" + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL" + }, + { + "ArchStdEvent": "L2D_CACHE_WB" + }, + { + "ArchStdEvent": "L1D_TLB" + }, + { + "ArchStdEvent": "L1I_TLB" + }, + { + "ArchStdEvent": "L2D_TLB_REFILL" + }, + { + "ArchStdEvent": "L2I_TLB_REFILL" + }, + { + "ArchStdEvent": "L2D_TLB" + }, + { + "ArchStdEvent": "L2I_TLB" + }, + { + "ArchStdEvent": "DTLB_WALK" + }, + { + "ArchStdEvent": "ITLB_WALK" + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL_WR" + }, + { + "ArchStdEvent": "L1D_CACHE_LMISS_RD" + }, + { + "ArchStdEvent": "L1I_CACHE_LMISS" + }, + { + "ArchStdEvent": "L2D_CACHE_LMISS_RD" + }, + { + "PublicDescription": "Level 1 data or unified cache demand access", + "EventCode": "0x8140", + "EventName": "L1D_CACHE_RW", + "BriefDescription": "Level 1 data or unified cache demand access" + }, + { + "PublicDescription": "Level 1 data or unified cache preload or prefetch", + "EventCode": "0x8142", + "EventName": "L1D_CACHE_PRFM", + "BriefDescription": "Level 1 data or unified cache preload or prefetch" + }, + { + "PublicDescription": "Level 1 data or unified cache refill, preload or prefetch", + "EventCode": "0x8146", + "EventName": "L1D_CACHE_REFILL_PRFM", + "BriefDescription": "Level 1 data or unified cache refill, preload or prefetch" + }, + { + "ArchStdEvent": "L1D_TLB_RD" + }, + { + "ArchStdEvent": "L1D_TLB_WR" + }, + { + "ArchStdEvent": "L2D_TLB_REFILL_RD" + }, + { + "ArchStdEvent": "L2D_TLB_REFILL_WR" + }, + { + "ArchStdEvent": "L2D_TLB_RD" + }, + { + "ArchStdEvent": "L2D_TLB_WR" + }, + { + "PublicDescription": "L1D TLB miss", + "EventCode": "0xD600", + "EventName": "L1D_TLB_MISS", + "BriefDescription": "L1D TLB miss" + }, + { + "PublicDescription": "Level 1 prefetcher, load prefetch requests generated", + "EventCode": "0xd606", + "EventName": "L1_PREFETCH_LD_GEN", + "BriefDescription": "Level 1 prefetcher, load prefetch requests generated" + }, + { + "PublicDescription": "Level 1 prefetcher, load prefetch fills into the level 1 cache", + "EventCode": "0xd607", + "EventName": "L1_PREFETCH_LD_FILL", + "BriefDescription": "Level 1 prefetcher, load prefetch fills into the level 1 cache" + }, + { + "PublicDescription": "Level 1 prefetcher, load prefetch to level 2 generated", + "EventCode": "0xd608", + "EventName": "L1_PREFETCH_L2_REQ", + "BriefDescription": "Level 1 prefetcher, load prefetch to level 2 generated" + }, + { + "PublicDescription": "L1 prefetcher, distance was reset", + "EventCode": "0xd609", + "EventName": "L1_PREFETCH_DIST_RST", + "BriefDescription": "L1 prefetcher, distance was reset" + }, + { + "PublicDescription": "L1 prefetcher, distance was increased", + "EventCode": "0xd60a", + "EventName": "L1_PREFETCH_DIST_INC", + "BriefDescription": "L1 prefetcher, distance was increased" + }, + { + "PublicDescription": "Level 1 prefetcher, table entry is trained", + "EventCode": "0xd60b", + "EventName": "L1_PREFETCH_ENTRY_TRAINED", + "BriefDescription": "Level 1 prefetcher, table entry is trained" + }, + { + "PublicDescription": "L1 data cache refill - Read or Write", + "EventCode": "0xd60e", + "EventName": "L1D_CACHE_REFILL_RW", + "BriefDescription": "L1 data cache refill - Read or Write" + }, + { + "PublicDescription": "Level 2 cache refill from instruction-side miss, including IMMU refills", + "EventCode": "0xD701", + "EventName": "L2C_INST_REFILL", + "BriefDescription": "Level 2 cache refill from instruction-side miss, including IMMU refills" + }, + { + "PublicDescription": "Level 2 cache refill from data-side miss, including DMMU refills", + "EventCode": "0xD702", + "EventName": "L2C_DATA_REFILL", + "BriefDescription": "Level 2 cache refill from data-side miss, including DMMU refills" + }, + { + "PublicDescription": "Level 2 cache prefetcher, load prefetch requests generated", + "EventCode": "0xD703", + "EventName": "L2_PREFETCH_REQ", + "BriefDescription": "Level 2 cache prefetcher, load prefetch requests generated" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/core-imp-def.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/core-imp-def.json new file mode 100644 index 0000000000000..eb5a2208d2604 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/core-imp-def.json @@ -0,0 +1,464 @@ +[ + { + "PublicDescription": "Level 2 prefetch requests, refilled to L2 cache", + "EventCode": "0x10A", + "EventName": "L2_PREFETCH_REFILL", + "BriefDescription": "Level 2 prefetch requests, refilled to L2 cache" + }, + { + "PublicDescription": "Level 2 prefetch requests, late", + "EventCode": "0x10B", + "EventName": "L2_PREFETCH_UPGRADE", + "BriefDescription": "Level 2 prefetch requests, late" + }, + { + "PublicDescription": "Predictable branch speculatively executed that hit any level of BTB", + "EventCode": "0x110", + "EventName": "BPU_HIT_BTB", + "BriefDescription": "Predictable branch speculatively executed that hit any level of BTB" + }, + { + "PublicDescription": "Predictable conditional branch speculatively executed that hit any level of BTB", + "EventCode": "0x111", + "EventName": "BPU_CONDITIONAL_BRANCH_HIT_BTB", + "BriefDescription": "Predictable conditional branch speculatively executed that hit any level of BTB" + }, + { + "PublicDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the indirect predictor", + "EventCode": "0x112", + "EventName": "BPU_HIT_INDIRECT_PREDICTOR", + "BriefDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the indirect predictor" + }, + { + "PublicDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the return predictor", + "EventCode": "0x113", + "EventName": "BPU_HIT_RSB", + "BriefDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the return predictor" + }, + { + "PublicDescription": "Predictable unconditional branch speculatively executed that did not hit any level of BTB", + "EventCode": "0x114", + "EventName": "BPU_UNCONDITIONAL_BRANCH_MISS_BTB", + "BriefDescription": "Predictable unconditional branch speculatively executed that did not hit any level of BTB" + }, + { + "PublicDescription": "Predictable branch speculatively executed, unpredicted", + "EventCode": "0x115", + "EventName": "BPU_BRANCH_NO_HIT", + "BriefDescription": "Predictable branch speculatively executed, unpredicted" + }, + { + "PublicDescription": "Predictable branch speculatively executed that hit any level of BTB that mispredict", + "EventCode": "0x116", + "EventName": "BPU_HIT_BTB_AND_MISPREDICT", + "BriefDescription": "Predictable branch speculatively executed that hit any level of BTB that mispredict" + }, + { + "PublicDescription": "Predictable conditional branch speculatively executed that hit any level of BTB that (direction) mispredict", + "EventCode": "0x117", + "EventName": "BPU_CONDITIONAL_BRANCH_HIT_BTB_AND_MISPREDICT", + "BriefDescription": "Predictable conditional branch speculatively executed that hit any level of BTB that (direction) mispredict" + }, + { + "PublicDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the indirect predictor that mispredict", + "EventCode": "0x118", + "EventName": "BPU_INDIRECT_BRANCH_HIT_BTB_AND_MISPREDICT", + "BriefDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the indirect predictor that mispredict" + }, + { + "PublicDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the return predictor that mispredict", + "EventCode": "0x119", + "EventName": "BPU_HIT_RSB_AND_MISPREDICT", + "BriefDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the return predictor that mispredict" + }, + { + "PublicDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the overflow/underflow return predictor that mispredict", + "EventCode": "0x11a", + "EventName": "BPU_MISS_RSB_AND_MISPREDICT", + "BriefDescription": "Predictable taken branch speculatively executed that hit any level of BTB that access the overflow/underflow return predictor that mispredict" + }, + { + "PublicDescription": "Predictable branch speculatively executed, unpredicted, that mispredict", + "EventCode": "0x11b", + "EventName": "BPU_NO_PREDICTION_MISPREDICT", + "BriefDescription": "Predictable branch speculatively executed, unpredicted, that mispredict" + }, + { + "PublicDescription": "Preditable branch update the BTB region buffer entry", + "EventCode": "0x11c", + "EventName": "BPU_BTB_UPDATE", + "BriefDescription": "Preditable branch update the BTB region buffer entry" + }, + { + "PublicDescription": "Count predict pipe stalls due to speculative return address predictor full", + "EventCode": "0x11d", + "EventName": "BPU_RSB_FULL_STALL", + "BriefDescription": "Count predict pipe stalls due to speculative return address predictor full" + }, + { + "PublicDescription": "Macro-ops speculatively decoded", + "EventCode": "0x11f", + "EventName": "ICF_INST_SPEC_DECODE", + "BriefDescription": "Macro-ops speculatively decoded" + }, + { + "PublicDescription": "Flushes", + "EventCode": "0x120", + "EventName": "GPC_FLUSH", + "BriefDescription": "Flushes" + }, + { + "PublicDescription": "Flushes due to memory hazards", + "EventCode": "0x121", + "EventName": "GPC_FLUSH_MEM_FAULT", + "BriefDescription": "Flushes due to memory hazards" + }, + { + "PublicDescription": "ETM extout bit 0", + "EventCode": "0x141", + "EventName": "MSC_ETM_EXTOUT0", + "BriefDescription": "ETM extout bit 0" + }, + { + "PublicDescription": "ETM extout bit 1", + "EventCode": "0x142", + "EventName": "MSC_ETM_EXTOUT1", + "BriefDescription": "ETM extout bit 1" + }, + { + "PublicDescription": "ETM extout bit 2", + "EventCode": "0x143", + "EventName": "MSC_ETM_EXTOUT2", + "BriefDescription": "ETM extout bit 2" + }, + { + "PublicDescription": "ETM extout bit 3", + "EventCode": "0x144", + "EventName": "MSC_ETM_EXTOUT3", + "BriefDescription": "ETM extout bit 3" + }, + { + "PublicDescription": "Bus request sn", + "EventCode": "0x156", + "EventName": "L2C_SNOOP", + "BriefDescription": "Bus request sn" + }, + { + "PublicDescription": "L2 TXDAT LCRD blocked", + "EventCode": "0x169", + "EventName": "L2C_DAT_CRD_STALL", + "BriefDescription": "L2 TXDAT LCRD blocked" + }, + { + "PublicDescription": "L2 TXRSP LCRD blocked", + "EventCode": "0x16a", + "EventName": "L2C_RSP_CRD_STALL", + "BriefDescription": "L2 TXRSP LCRD blocked" + }, + { + "PublicDescription": "L2 TXREQ LCRD blocked", + "EventCode": "0x16b", + "EventName": "L2C_REQ_CRD_STALL", + "BriefDescription": "L2 TXREQ LCRD blocked" + }, + { + "PublicDescription": "Early mispredict", + "EventCode": "0xD100", + "EventName": "ICF_EARLY_MIS_PRED", + "BriefDescription": "Early mispredict" + }, + { + "PublicDescription": "FEQ full cycles", + "EventCode": "0xD101", + "EventName": "ICF_FEQ_FULL", + "BriefDescription": "FEQ full cycles" + }, + { + "PublicDescription": "Instruction FIFO Full", + "EventCode": "0xD102", + "EventName": "ICF_INST_FIFO_FULL", + "BriefDescription": "Instruction FIFO Full" + }, + { + "PublicDescription": "L1I TLB miss", + "EventCode": "0xD103", + "EventName": "L1I_TLB_MISS", + "BriefDescription": "L1I TLB miss" + }, + { + "PublicDescription": "ICF sent 0 instructions to IDR this cycle", + "EventCode": "0xD104", + "EventName": "ICF_STALL", + "BriefDescription": "ICF sent 0 instructions to IDR this cycle" + }, + { + "PublicDescription": "PC FIFO Full", + "EventCode": "0xD105", + "EventName": "ICF_PC_FIFO_FULL", + "BriefDescription": "PC FIFO Full" + }, + { + "PublicDescription": "Stall due to BOB ID", + "EventCode": "0xD200", + "EventName": "IDR_STALL_BOB_ID", + "BriefDescription": "Stall due to BOB ID" + }, + { + "PublicDescription": "Dispatch stall due to LOB entries", + "EventCode": "0xD201", + "EventName": "IDR_STALL_LOB_ID", + "BriefDescription": "Dispatch stall due to LOB entries" + }, + { + "PublicDescription": "Dispatch stall due to SOB entries", + "EventCode": "0xD202", + "EventName": "IDR_STALL_SOB_ID", + "BriefDescription": "Dispatch stall due to SOB entries" + }, + { + "PublicDescription": "Dispatch stall due to IXU scheduler entries", + "EventCode": "0xD203", + "EventName": "IDR_STALL_IXU_SCHED", + "BriefDescription": "Dispatch stall due to IXU scheduler entries" + }, + { + "PublicDescription": "Dispatch stall due to FSU scheduler entries", + "EventCode": "0xD204", + "EventName": "IDR_STALL_FSU_SCHED", + "BriefDescription": "Dispatch stall due to FSU scheduler entries" + }, + { + "PublicDescription": "Dispatch stall due to ROB entries", + "EventCode": "0xD205", + "EventName": "IDR_STALL_ROB_ID", + "BriefDescription": "Dispatch stall due to ROB entries" + }, + { + "PublicDescription": "Dispatch stall due to flush", + "EventCode": "0xD206", + "EventName": "IDR_STALL_FLUSH", + "BriefDescription": "Dispatch stall due to flush" + }, + { + "PublicDescription": "Dispatch stall due to WFI", + "EventCode": "0xD207", + "EventName": "IDR_STALL_WFI", + "BriefDescription": "Dispatch stall due to WFI" + }, + { + "PublicDescription": "Number of SWOB drains triggered by timeout", + "EventCode": "0xD208", + "EventName": "IDR_STALL_SWOB_TIMEOUT", + "BriefDescription": "Number of SWOB drains triggered by timeout" + }, + { + "PublicDescription": "Number of SWOB drains triggered by system register or special-purpose register read-after-write or specific special-purpose register writes that cause SWOB drain", + "EventCode": "0xD209", + "EventName": "IDR_STALL_SWOB_RAW", + "BriefDescription": "Number of SWOB drains triggered by system register or special-purpose register read-after-write or specific special-purpose register writes that cause SWOB drain" + }, + { + "PublicDescription": "Number of SWOB drains triggered by system register write when SWOB full", + "EventCode": "0xD20A", + "EventName": "IDR_STALL_SWOB_FULL", + "BriefDescription": "Number of SWOB drains triggered by system register write when SWOB full" + }, + { + "PublicDescription": "Dispatch stall due to L1 instruction cache miss", + "EventCode": "0xD20B", + "EventName": "STALL_FRONTEND_CACHE", + "BriefDescription": "Dispatch stall due to L1 instruction cache miss" + }, + { + "PublicDescription": "Dispatch stall due to L1 data cache miss", + "EventCode": "0xD20D", + "EventName": "STALL_BACKEND_CACHE", + "BriefDescription": "Dispatch stall due to L1 data cache miss" + }, + { + "PublicDescription": "Dispatch stall due to lack of any core resource", + "EventCode": "0xD20F", + "EventName": "STALL_BACKEND_RESOURCE", + "BriefDescription": "Dispatch stall due to lack of any core resource" + }, + { + "PublicDescription": "Instructions issued by the scheduler", + "EventCode": "0xD300", + "EventName": "IXU_NUM_UOPS_ISSUED", + "BriefDescription": "Instructions issued by the scheduler" + }, + { + "PublicDescription": "Any uop issued was canceled for any reason", + "EventCode": "0xD301", + "EventName": "IXU_ISSUE_CANCEL", + "BriefDescription": "Any uop issued was canceled for any reason" + }, + { + "PublicDescription": "A load wakeup to the scheduler has been canceled", + "EventCode": "0xD302", + "EventName": "IXU_LOAD_CANCEL", + "BriefDescription": "A load wakeup to the scheduler has been canceled" + }, + { + "PublicDescription": "The scheduler had to cancel one slow Uop due to resource conflict", + "EventCode": "0xD303", + "EventName": "IXU_SLOW_CANCEL", + "BriefDescription": "The scheduler had to cancel one slow Uop due to resource conflict" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXA", + "EventCode": "0xD304", + "EventName": "IXU_IXA_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXA" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXA Par 0", + "EventCode": "0xD305", + "EventName": "IXU_IXA_PAR0_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXA Par 0" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXA Par 1", + "EventCode": "0xD306", + "EventName": "IXU_IXA_PAR1_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXA Par 1" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXB", + "EventCode": "0xD307", + "EventName": "IXU_IXB_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXB" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXB Par 0", + "EventCode": "0xD308", + "EventName": "IXU_IXB_PAR0_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXB Par 0" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXB Par 1", + "EventCode": "0xD309", + "EventName": "IXU_IXB_PAR1_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXB Par 1" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXC", + "EventCode": "0xD30A", + "EventName": "IXU_IXC_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXC" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXC Par 0", + "EventCode": "0xD30B", + "EventName": "IXU_IXC_PAR0_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXC Par 0" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXC Par 1", + "EventCode": "0xD30C", + "EventName": "IXU_IXC_PAR1_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXC Par 1" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXD", + "EventCode": "0xD30D", + "EventName": "IXU_IXD_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXD" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXD Par 0", + "EventCode": "0xD30E", + "EventName": "IXU_IXD_PAR0_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXD Par 0" + }, + { + "PublicDescription": "Uops issued by the scheduler on IXD Par 1", + "EventCode": "0xD30F", + "EventName": "IXU_IXD_PAR1_ISSUED", + "BriefDescription": "Uops issued by the scheduler on IXD Par 1" + }, + { + "PublicDescription": "Uops issued by the FSU scheduler", + "EventCode": "0xD400", + "EventName": "FSU_ISSUED", + "BriefDescription": "Uops issued by the FSU scheduler" + }, + { + "PublicDescription": "Uops issued by the scheduler on FSX", + "EventCode": "0xD401", + "EventName": "FSU_FSX_ISSUED", + "BriefDescription": "Uops issued by the scheduler on FSX" + }, + { + "PublicDescription": "Uops issued by the scheduler on FSY", + "EventCode": "0xD402", + "EventName": "FSU_FSY_ISSUED", + "BriefDescription": "Uops issued by the scheduler on FSY" + }, + { + "PublicDescription": "Uops issued by the scheduler on FSZ", + "EventCode": "0xD403", + "EventName": "FSU_FSZ_ISSUED", + "BriefDescription": "Uops issued by the scheduler on FSZ" + }, + { + "PublicDescription": "Uops canceled (load cancels)", + "EventCode": "0xD404", + "EventName": "FSU_CANCEL", + "BriefDescription": "Uops canceled (load cancels)" + }, + { + "PublicDescription": "Count scheduler stalls due to divide/sqrt", + "EventCode": "0xD405", + "EventName": "FSU_DIV_SQRT_STALL", + "BriefDescription": "Count scheduler stalls due to divide/sqrt" + }, + { + "PublicDescription": "Number of SWOB drains", + "EventCode": "0xD500", + "EventName": "GPC_SWOB_DRAIN", + "BriefDescription": "Number of SWOB drains" + }, + { + "PublicDescription": "GPC detected a Breakpoint instruction match", + "EventCode": "0xD501", + "EventName": "BREAKPOINT_MATCH", + "BriefDescription": "GPC detected a Breakpoint instruction match" + }, + { + "PublicDescription": "Core progress monitor triggered", + "EventCode": "0xd502", + "EventName": "GPC_CPM_TRIGGER", + "BriefDescription": "Core progress monitor triggered" + }, + { + "PublicDescription": "Fill buffer full", + "EventCode": "0xD601", + "EventName": "OFB_FULL", + "BriefDescription": "Fill buffer full" + }, + { + "PublicDescription": "Load satisified from store forwarded data", + "EventCode": "0xD605", + "EventName": "LD_FROM_ST_FWD", + "BriefDescription": "Load satisified from store forwarded data" + }, + { + "PublicDescription": "Store retirement pipe stall", + "EventCode": "0xD60C", + "EventName": "LSU_ST_RETIRE_STALL", + "BriefDescription": "Store retirement pipe stall" + }, + { + "PublicDescription": "LSU detected a Watchpoint data match", + "EventCode": "0xD60D", + "EventName": "WATCHPOINT_MATCH", + "BriefDescription": "LSU detected a Watchpoint data match" + }, + { + "PublicDescription": "Counts cycles that MSC is telling GPC to stall commit due to ETM ISTALL feature", + "EventCode": "0xda00", + "EventName": "MSC_ETM_COMMIT_STALL", + "BriefDescription": "Counts cycles that MSC is telling GPC to stall commit due to ETM ISTALL feature" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/exception.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/exception.json new file mode 100644 index 0000000000000..bd59ba7b74e42 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/exception.json @@ -0,0 +1,47 @@ +[ + { + "ArchStdEvent": "EXC_UNDEF" + }, + { + "ArchStdEvent": "EXC_SVC" + }, + { + "ArchStdEvent": "EXC_PABORT" + }, + { + "ArchStdEvent": "EXC_DABORT" + }, + { + "ArchStdEvent": "EXC_IRQ" + }, + { + "ArchStdEvent": "EXC_FIQ" + }, + { + "ArchStdEvent": "EXC_HVC" + }, + { + "ArchStdEvent": "EXC_TRAP_PABORT" + }, + { + "ArchStdEvent": "EXC_TRAP_DABORT" + }, + { + "ArchStdEvent": "EXC_TRAP_OTHER" + }, + { + "ArchStdEvent": "EXC_TRAP_IRQ" + }, + { + "ArchStdEvent": "EXC_TRAP_FIQ" + }, + { + "ArchStdEvent": "EXC_TAKEN" + }, + { + "ArchStdEvent": "EXC_RETURN" + }, + { + "ArchStdEvent": "EXC_SMC" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/instruction.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/instruction.json new file mode 100644 index 0000000000000..a6a20f541e333 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/instruction.json @@ -0,0 +1,128 @@ +[ + { + "ArchStdEvent": "SW_INCR" + }, + { + "ArchStdEvent": "ST_RETIRED" + }, + { + "ArchStdEvent": "LD_SPEC" + }, + { + "ArchStdEvent": "ST_SPEC" + }, + { + "ArchStdEvent": "LDST_SPEC" + }, + { + "ArchStdEvent": "DP_SPEC" + }, + { + "ArchStdEvent": "ASE_SPEC" + }, + { + "ArchStdEvent": "VFP_SPEC" + }, + { + "ArchStdEvent": "PC_WRITE_SPEC" + }, + { + "ArchStdEvent": "BR_IMMED_RETIRED" + }, + { + "ArchStdEvent": "BR_RETURN_RETIRED" + }, + { + "ArchStdEvent": "CRYPTO_SPEC" + }, + { + "ArchStdEvent": "ISB_SPEC" + }, + { + "ArchStdEvent": "DSB_SPEC" + }, + { + "ArchStdEvent": "DMB_SPEC" + }, + { + "ArchStdEvent": "RC_LD_SPEC" + }, + { + "ArchStdEvent": "RC_ST_SPEC" + }, + { + "ArchStdEvent": "INST_RETIRED" + }, + { + "ArchStdEvent": "CID_WRITE_RETIRED" + }, + { + "ArchStdEvent": "PC_WRITE_RETIRED" + }, + { + "ArchStdEvent": "INST_SPEC" + }, + { + "ArchStdEvent": "TTBR_WRITE_RETIRED" + }, + { + "ArchStdEvent": "BR_RETIRED" + }, + { + "ArchStdEvent": "BR_MIS_PRED_RETIRED" + }, + { + "ArchStdEvent": "OP_RETIRED" + }, + { + "ArchStdEvent": "OP_SPEC" + }, + { + "PublicDescription": "Operation speculatively executed - ASE Scalar", + "EventCode": "0xd210", + "EventName": "ASE_SCALAR_SPEC", + "BriefDescription": "Operation speculatively executed - ASE Scalar" + }, + { + "PublicDescription": "Operation speculatively executed - ASE Vector", + "EventCode": "0xd211", + "EventName": "ASE_VECTOR_SPEC", + "BriefDescription": "Operation speculatively executed - ASE Vector" + }, + { + "PublicDescription": "Barrier speculatively executed, CSDB", + "EventCode": "0x7f", + "EventName": "CSDB_SPEC", + "BriefDescription": "Barrier speculatively executed, CSDB" + }, + { + "PublicDescription": "Prefetch sent to L2.", + "EventCode": "0xd106", + "EventName": "ICF_PREFETCH_DISPATCH", + "BriefDescription": "Prefetch sent to L2." + }, + { + "PublicDescription": "Prefetch response received but was dropped since we don't support inflight upgrades.", + "EventCode": "0xd107", + "EventName": "ICF_PREFETCH_DROPPED_NO_UPGRADE", + "BriefDescription": "Prefetch response received but was dropped since we don't support inflight upgrades." + }, + { + "PublicDescription": "Prefetch request missed TLB.", + "EventCode": "0xd108", + "EventName": "ICF_PREFETCH_DROPPED_TLB_MISS", + "BriefDescription": "Prefetch request missed TLB." + }, + { + "PublicDescription": "Prefetch request dropped since duplicate was found in TLB.", + "EventCode": "0xd109", + "EventName": "ICF_PREFETCH_DROPPED_DUPLICATE", + "BriefDescription": "Prefetch request dropped since duplicate was found in TLB." + }, + { + "PublicDescription": "Prefetch request dropped since it was found in cache.", + "EventCode": "0xd10a", + "EventName": "ICF_PREFETCH_DROPPED_CACHE_HIT", + "BriefDescription": "Prefetch request dropped since it was found in cache." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/intrinsic.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/intrinsic.json new file mode 100644 index 0000000000000..7ecffb989ae04 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/intrinsic.json @@ -0,0 +1,14 @@ +[ + { + "ArchStdEvent": "LDREX_SPEC" + }, + { + "ArchStdEvent": "STREX_PASS_SPEC" + }, + { + "ArchStdEvent": "STREX_FAIL_SPEC" + }, + { + "ArchStdEvent": "STREX_SPEC" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/memory.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/memory.json new file mode 100644 index 0000000000000..a211d94aacde6 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/memory.json @@ -0,0 +1,41 @@ +[ + { + "ArchStdEvent": "LD_RETIRED" + }, + { + "ArchStdEvent": "MEM_ACCESS_RD" + }, + { + "ArchStdEvent": "MEM_ACCESS_WR" + }, + { + "ArchStdEvent": "LD_ALIGN_LAT" + }, + { + "ArchStdEvent": "ST_ALIGN_LAT" + }, + { + "ArchStdEvent": "MEM_ACCESS" + }, + { + "ArchStdEvent": "MEMORY_ERROR" + }, + { + "ArchStdEvent": "LDST_ALIGN_LAT" + }, + { + "ArchStdEvent": "MEM_ACCESS_CHECKED" + }, + { + "ArchStdEvent": "MEM_ACCESS_CHECKED_RD" + }, + { + "ArchStdEvent": "MEM_ACCESS_CHECKED_WR" + }, + { + "PublicDescription": "Flushes due to memory hazards", + "EventCode": "0x121", + "EventName": "BPU_FLUSH_MEM_FAULT", + "BriefDescription": "Flushes due to memory hazards" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/metrics.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/metrics.json new file mode 100644 index 0000000000000..c5d1d22bd034b --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/metrics.json @@ -0,0 +1,442 @@ +[ + { + "MetricName": "branch_miss_pred_rate", + "MetricExpr": "BR_MIS_PRED / BR_PRED", + "BriefDescription": "Branch predictor misprediction rate. May not count branches that are never resolved because they are in the misprediction shadow of an earlier branch", + "MetricGroup": "branch", + "ScaleUnit": "100%" + }, + { + "MetricName": "bus_utilization", + "MetricExpr": "BUS_ACCESS / (BUS_CYCLES * 1)", + "BriefDescription": "Core-to-uncore bus utilization", + "MetricGroup": "Bus", + "ScaleUnit": "100percent of bus cycles" + }, + { + "MetricName": "l1d_cache_miss_ratio", + "MetricExpr": "L1D_CACHE_REFILL / L1D_CACHE", + "BriefDescription": "This metric measures the ratio of level 1 data cache accesses missed to the total number of level 1 data cache accesses. This gives an indication of the effectiveness of the level 1 data cache.", + "MetricGroup": "Miss_Ratio;L1D_Cache_Effectiveness", + "ScaleUnit": "1per cache access" + }, + { + "MetricName": "l1i_cache_miss_ratio", + "MetricExpr": "L1I_CACHE_REFILL / L1I_CACHE", + "BriefDescription": "This metric measures the ratio of level 1 instruction cache accesses missed to the total number of level 1 instruction cache accesses. This gives an indication of the effectiveness of the level 1 instruction cache.", + "MetricGroup": "Miss_Ratio;L1I_Cache_Effectiveness", + "ScaleUnit": "1per cache access" + }, + { + "MetricName": "Miss_Ratio;l1d_cache_read_miss", + "MetricExpr": "L1D_CACHE_LMISS_RD / L1D_CACHE_RD", + "BriefDescription": "L1D cache read miss rate", + "MetricGroup": "Cache", + "ScaleUnit": "1per cache read access" + }, + { + "MetricName": "l2_cache_miss_ratio", + "MetricExpr": "L2D_CACHE_REFILL / L2D_CACHE", + "BriefDescription": "This metric measures the ratio of level 2 cache accesses missed to the total number of level 2 cache accesses. This gives an indication of the effectiveness of the level 2 cache, which is a unified cache that stores both data and instruction. Note that cache accesses in this cache are either data memory access or instruction fetch as this is a unified cache.", + "MetricGroup": "Miss_Ratio;L2_Cache_Effectiveness", + "ScaleUnit": "1per cache access" + }, + { + "MetricName": "l1i_cache_read_miss_rate", + "MetricExpr": "L1I_CACHE_LMISS / L1I_CACHE", + "BriefDescription": "L1I cache read miss rate", + "MetricGroup": "Cache", + "ScaleUnit": "1per cache access" + }, + { + "MetricName": "l2d_cache_read_miss_rate", + "MetricExpr": "L2D_CACHE_LMISS_RD / L2D_CACHE_RD", + "BriefDescription": "L2 cache read miss rate", + "MetricGroup": "Cache", + "ScaleUnit": "1per cache read access" + }, + { + "MetricName": "l1d_cache_miss_mpki", + "MetricExpr": "(L1D_CACHE_LMISS_RD * 1e3) / INST_RETIRED", + "BriefDescription": "Misses per thousand instructions (data)", + "MetricGroup": "Cache", + "ScaleUnit": "1MPKI" + }, + { + "MetricName": "l1i_cache_miss_mpki", + "MetricExpr": "(L1I_CACHE_LMISS * 1e3) / INST_RETIRED", + "BriefDescription": "Misses per thousand instructions (instruction)", + "MetricGroup": "Cache", + "ScaleUnit": "1MPKI" + }, + { + "MetricName": "simd_percentage", + "MetricExpr": "ASE_SPEC / INST_SPEC", + "BriefDescription": "This metric measures advanced SIMD operations as a percentage of total operations speculatively executed.", + "MetricGroup": "Operation_Mix", + "ScaleUnit": "100percent of operations" + }, + { + "MetricName": "crypto_percentage", + "MetricExpr": "CRYPTO_SPEC / INST_SPEC", + "BriefDescription": "This metric measures crypto operations as a percentage of operations speculatively executed.", + "MetricGroup": "Operation_Mix", + "ScaleUnit": "100percent of operations" + }, + { + "MetricName": "gflops", + "MetricExpr": "VFP_SPEC / (duration_time * 1e9)", + "BriefDescription": "Giga-floating point operations per second", + "MetricGroup": "InstructionMix" + }, + { + "MetricName": "integer_dp_percentage", + "MetricExpr": "DP_SPEC / INST_SPEC", + "BriefDescription": "This metric measures scalar integer operations as a percentage of operations speculatively executed.", + "MetricGroup": "Operation_Mix", + "ScaleUnit": "100percent of operations" + }, + { + "MetricName": "ipc", + "MetricExpr": "INST_RETIRED / CPU_CYCLES", + "BriefDescription": "This metric measures the number of instructions retired per cycle.", + "MetricGroup": "General", + "ScaleUnit": "1per cycle" + }, + { + "MetricName": "load_percentage", + "MetricExpr": "LD_SPEC / INST_SPEC", + "BriefDescription": "This metric measures load operations as a percentage of operations speculatively executed.", + "MetricGroup": "Operation_Mix", + "ScaleUnit": "100percent of operations" + }, + { + "MetricName": "load_store_spec_rate", + "MetricExpr": "LDST_SPEC / INST_SPEC", + "BriefDescription": "The rate of load or store instructions speculatively executed to overall instructions speclatively executed", + "MetricGroup": "Operation_Mix", + "ScaleUnit": "100percent of operations" + }, + { + "MetricName": "retired_mips", + "MetricExpr": "INST_RETIRED / (duration_time * 1e6)", + "BriefDescription": "Millions of instructions per second", + "MetricGroup": "InstructionMix" + }, + { + "MetricName": "spec_utilization_mips", + "MetricExpr": "INST_SPEC / (duration_time * 1e6)", + "BriefDescription": "Millions of instructions per second", + "MetricGroup": "PEutilization" + }, + { + "MetricName": "pc_write_spec_rate", + "MetricExpr": "PC_WRITE_SPEC / INST_SPEC", + "BriefDescription": "The rate of software change of the PC speculatively executed to overall instructions speclatively executed", + "MetricGroup": "Operation_Mix", + "ScaleUnit": "100percent of operations" + }, + { + "MetricName": "store_percentage", + "MetricExpr": "ST_SPEC / INST_SPEC", + "BriefDescription": "This metric measures store operations as a percentage of operations speculatively executed.", + "MetricGroup": "Operation_Mix", + "ScaleUnit": "100percent of operations" + }, + { + "MetricName": "scalar_fp_percentage", + "MetricExpr": "VFP_SPEC / INST_SPEC", + "BriefDescription": "This metric measures scalar floating point operations as a percentage of operations speculatively executed.", + "MetricGroup": "Operation_Mix", + "ScaleUnit": "100percent of operations" + }, + { + "MetricName": "retired_rate", + "MetricExpr": "OP_RETIRED / OP_SPEC", + "BriefDescription": "Of all the micro-operations issued, what percentage are retired(committed)", + "MetricGroup": "General", + "ScaleUnit": "100%" + }, + { + "MetricName": "wasted", + "MetricExpr": "1 - (OP_RETIRED / (CPU_CYCLES * #slots))", + "BriefDescription": "Of all the micro-operations issued, what proportion are lost", + "MetricGroup": "General", + "ScaleUnit": "100%" + }, + { + "MetricName": "wasted_rate", + "MetricExpr": "1 - OP_RETIRED / OP_SPEC", + "BriefDescription": "Of all the micro-operations issued, what percentage are not retired(committed)", + "MetricGroup": "General", + "ScaleUnit": "100%" + }, + { + "MetricName": "stall_backend_cache_rate", + "MetricExpr": "STALL_BACKEND_CACHE / CPU_CYCLES", + "BriefDescription": "Proportion of cycles stalled and no operations issued to backend and cache miss", + "MetricGroup": "Stall", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "stall_backend_resource_rate", + "MetricExpr": "STALL_BACKEND_RESOURCE / CPU_CYCLES", + "BriefDescription": "Proportion of cycles stalled and no operations issued to backend and resource full", + "MetricGroup": "Stall", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "stall_backend_tlb_rate", + "MetricExpr": "STALL_BACKEND_TLB / CPU_CYCLES", + "BriefDescription": "Proportion of cycles stalled and no operations issued to backend and TLB miss", + "MetricGroup": "Stall", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "stall_frontend_cache_rate", + "MetricExpr": "STALL_FRONTEND_CACHE / CPU_CYCLES", + "BriefDescription": "Proportion of cycles stalled and no ops delivered from frontend and cache miss", + "MetricGroup": "Stall", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "stall_frontend_tlb_rate", + "MetricExpr": "STALL_FRONTEND_TLB / CPU_CYCLES", + "BriefDescription": "Proportion of cycles stalled and no ops delivered from frontend and TLB miss", + "MetricGroup": "Stall", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "dtlb_walk_ratio", + "MetricExpr": "DTLB_WALK / L1D_TLB", + "BriefDescription": "This metric measures the ratio of data TLB Walks to the total number of data TLB accesses. This gives an indication of the effectiveness of the data TLB accesses.", + "MetricGroup": "Miss_Ratio;DTLB_Effectiveness", + "ScaleUnit": "1per TLB access" + }, + { + "MetricName": "itlb_walk_ratio", + "MetricExpr": "ITLB_WALK / L1I_TLB", + "BriefDescription": "This metric measures the ratio of instruction TLB Walks to the total number of instruction TLB accesses. This gives an indication of the effectiveness of the instruction TLB accesses.", + "MetricGroup": "Miss_Ratio;ITLB_Effectiveness", + "ScaleUnit": "1per TLB access" + }, + { + "ArchStdEvent": "backend_bound" + }, + { + "ArchStdEvent": "frontend_bound", + "MetricExpr": "100 - (retired_fraction + slots_lost_misspeculation_fraction + backend_bound)" + }, + { + "MetricName": "slots_lost_misspeculation_fraction", + "MetricExpr": "(OP_SPEC - OP_RETIRED) / (CPU_CYCLES * #slots)", + "BriefDescription": "Fraction of slots lost due to misspeculation", + "DefaultMetricgroupName": "TopdownL1", + "MetricGroup": "Default;TopdownL1", + "ScaleUnit": "100percent of slots" + }, + { + "MetricName": "retired_fraction", + "MetricExpr": "OP_RETIRED / (CPU_CYCLES * #slots)", + "BriefDescription": "Fraction of slots retiring, useful work", + "DefaultMetricgroupName": "TopdownL1", + "MetricGroup": "Default;TopdownL1", + "ScaleUnit": "100percent of slots" + }, + { + "MetricName": "backend_core", + "MetricExpr": "(backend_bound / 100) - backend_memory", + "BriefDescription": "Fraction of slots the CPU was stalled due to backend non-memory subsystem issues", + "MetricGroup": "TopdownL2", + "ScaleUnit": "100%" + }, + { + "MetricName": "backend_memory", + "MetricExpr": "(STALL_BACKEND_TLB + STALL_BACKEND_CACHE) / CPU_CYCLES", + "BriefDescription": "Fraction of slots the CPU was stalled due to backend memory subsystem issues (cache/tlb miss)", + "MetricGroup": "TopdownL2", + "ScaleUnit": "100%" + }, + { + "MetricName": "branch_mispredict", + "MetricExpr": "(BR_MIS_PRED_RETIRED / GPC_FLUSH) * slots_lost_misspeculation_fraction", + "BriefDescription": "Fraction of slots lost due to branch misprediciton", + "MetricGroup": "TopdownL2", + "ScaleUnit": "1percent of slots" + }, + { + "MetricName": "frontend_bandwidth", + "MetricExpr": "frontend_bound - frontend_latency", + "BriefDescription": "Fraction of slots the CPU did not dispatch at full bandwidth - able to dispatch partial slots only (1, 2, or 3 uops)", + "MetricGroup": "TopdownL2", + "ScaleUnit": "1percent of slots" + }, + { + "MetricName": "frontend_latency", + "MetricExpr": "(STALL_FRONTEND - ((STALL_SLOT_FRONTEND - ((frontend_bound / 100) * CPU_CYCLES * #slots)) / #slots)) / CPU_CYCLES", + "BriefDescription": "Fraction of slots the CPU was stalled due to frontend latency issues (cache/tlb miss); nothing to dispatch", + "MetricGroup": "TopdownL2", + "ScaleUnit": "100percent of slots" + }, + { + "MetricName": "other_miss_pred", + "MetricExpr": "slots_lost_misspeculation_fraction - branch_mispredict", + "BriefDescription": "Fraction of slots lost due to other/non-branch misprediction misspeculation", + "MetricGroup": "TopdownL2", + "ScaleUnit": "1percent of slots" + }, + { + "MetricName": "pipe_utilization", + "MetricExpr": "100 * ((IXU_NUM_UOPS_ISSUED + FSU_ISSUED) / (CPU_CYCLES * 6))", + "BriefDescription": "Fraction of execute slots utilized", + "MetricGroup": "TopdownL2", + "ScaleUnit": "1percent of slots" + }, + { + "MetricName": "d_cache_l2_miss_rate", + "MetricExpr": "STALL_BACKEND_MEM / CPU_CYCLES", + "BriefDescription": "Fraction of cycles the CPU was stalled due to data L2 cache miss", + "MetricGroup": "TopdownL3", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "d_cache_miss_rate", + "MetricExpr": "STALL_BACKEND_CACHE / CPU_CYCLES", + "BriefDescription": "Fraction of cycles the CPU was stalled due to data cache miss", + "MetricGroup": "TopdownL3", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "d_tlb_miss_rate", + "MetricExpr": "STALL_BACKEND_TLB / CPU_CYCLES", + "BriefDescription": "Fraction of cycles the CPU was stalled due to data TLB miss", + "MetricGroup": "TopdownL3", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "fsu_pipe_utilization", + "MetricExpr": "FSU_ISSUED / (CPU_CYCLES * 2)", + "BriefDescription": "Fraction of FSU execute slots utilized", + "MetricGroup": "TopdownL3", + "ScaleUnit": "100percent of slots" + }, + { + "MetricName": "i_cache_miss_rate", + "MetricExpr": "STALL_FRONTEND_CACHE / CPU_CYCLES", + "BriefDescription": "Fraction of cycles the CPU was stalled due to instruction cache miss", + "MetricGroup": "TopdownL3", + "ScaleUnit": "100percent of slots" + }, + { + "MetricName": "i_tlb_miss_rate", + "MetricExpr": "STALL_FRONTEND_TLB / CPU_CYCLES", + "BriefDescription": "Fraction of cycles the CPU was stalled due to instruction TLB miss", + "MetricGroup": "TopdownL3", + "ScaleUnit": "100percent of slots" + }, + { + "MetricName": "ixu_pipe_utilization", + "MetricExpr": "IXU_NUM_UOPS_ISSUED / (CPU_CYCLES * #slots)", + "BriefDescription": "Fraction of IXU execute slots utilized", + "MetricGroup": "TopdownL3", + "ScaleUnit": "100percent of slots" + }, + { + "MetricName": "stall_recovery_rate", + "MetricExpr": "IDR_STALL_FLUSH / CPU_CYCLES", + "BriefDescription": "Fraction of cycles the CPU was stalled due to flush recovery", + "MetricGroup": "TopdownL3", + "ScaleUnit": "100percent of slots" + }, + { + "MetricName": "stall_fsu_sched_rate", + "MetricExpr": "IDR_STALL_FSU_SCHED / CPU_CYCLES", + "BriefDescription": "Fraction of cycles the CPU was stalled and FSU was full", + "MetricGroup": "TopdownL4", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "stall_ixu_sched_rate", + "MetricExpr": "IDR_STALL_IXU_SCHED / CPU_CYCLES", + "BriefDescription": "Fraction of cycles the CPU was stalled and IXU was full", + "MetricGroup": "TopdownL4", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "stall_lob_id_rate", + "MetricExpr": "IDR_STALL_LOB_ID / CPU_CYCLES", + "BriefDescription": "Fraction of cycles the CPU was stalled and LOB was full", + "MetricGroup": "TopdownL4", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "stall_rob_id_rate", + "MetricExpr": "IDR_STALL_ROB_ID / CPU_CYCLES", + "BriefDescription": "Fraction of cycles the CPU was stalled and ROB was full", + "MetricGroup": "TopdownL4", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "stall_sob_id_rate", + "MetricExpr": "IDR_STALL_SOB_ID / CPU_CYCLES", + "BriefDescription": "Fraction of cycles the CPU was stalled and SOB was full", + "MetricGroup": "TopdownL4", + "ScaleUnit": "100percent of cycles" + }, + { + "MetricName": "l1d_cache_access_demand", + "MetricExpr": "L1D_CACHE_RW / L1D_CACHE", + "BriefDescription": "L1D cache access - demand", + "MetricGroup": "Cache", + "ScaleUnit": "100percent of cache acceses" + }, + { + "MetricName": "l1d_cache_access_prefetces", + "MetricExpr": "L1D_CACHE_PRFM / L1D_CACHE", + "BriefDescription": "L1D cache access - prefetch", + "MetricGroup": "Cache", + "ScaleUnit": "100percent of cache acceses" + }, + { + "MetricName": "l1d_cache_demand_misses", + "MetricExpr": "L1D_CACHE_REFILL_RW / L1D_CACHE", + "BriefDescription": "L1D cache demand misses", + "MetricGroup": "Cache", + "ScaleUnit": "100percent of cache acceses" + }, + { + "MetricName": "l1d_cache_demand_misses_read", + "MetricExpr": "L1D_CACHE_REFILL_RD / L1D_CACHE", + "BriefDescription": "L1D cache demand misses - read", + "MetricGroup": "Cache", + "ScaleUnit": "100percent of cache acceses" + }, + { + "MetricName": "l1d_cache_demand_misses_write", + "MetricExpr": "L1D_CACHE_REFILL_WR / L1D_CACHE", + "BriefDescription": "L1D cache demand misses - write", + "MetricGroup": "Cache", + "ScaleUnit": "100percent of cache acceses" + }, + { + "MetricName": "l1d_cache_prefetch_misses", + "MetricExpr": "L1D_CACHE_REFILL_PRFM / L1D_CACHE", + "BriefDescription": "L1D cache prefetch misses", + "MetricGroup": "Cache", + "ScaleUnit": "100percent of cache acceses" + }, + { + "MetricName": "ase_scalar_mix", + "MetricExpr": "ASE_SCALAR_SPEC / OP_SPEC", + "BriefDescription": "Proportion of advanced SIMD data processing operations (excluding DP_SPEC/LD_SPEC) scalar operations", + "MetricGroup": "Instructions", + "ScaleUnit": "100percent of cache acceses" + }, + { + "MetricName": "ase_vector_mix", + "MetricExpr": "ASE_VECTOR_SPEC / OP_SPEC", + "BriefDescription": "Proportion of advanced SIMD data processing operations (excluding DP_SPEC/LD_SPEC) vector operations", + "MetricGroup": "Instructions", + "ScaleUnit": "100percent of cache acceses" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/mmu.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/mmu.json new file mode 100644 index 0000000000000..66d83b680651e --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/mmu.json @@ -0,0 +1,170 @@ +[ + { + "PublicDescription": "Level 2 data translation buffer allocation", + "EventCode": "0xD800", + "EventName": "MMU_D_OTB_ALLOC", + "BriefDescription": "Level 2 data translation buffer allocation" + }, + { + "PublicDescription": "Data TLB translation cache hit on S1L2 walk cache entry", + "EventCode": "0xd801", + "EventName": "MMU_D_TRANS_CACHE_HIT_S1L2_WALK", + "BriefDescription": "Data TLB translation cache hit on S1L2 walk cache entry" + }, + { + "PublicDescription": "Data TLB translation cache hit on S1L1 walk cache entry", + "EventCode": "0xd802", + "EventName": "MMU_D_TRANS_CACHE_HIT_S1L1_WALK", + "BriefDescription": "Data TLB translation cache hit on S1L1 walk cache entry" + }, + { + "PublicDescription": "Data TLB translation cache hit on S1L0 walk cache entry", + "EventCode": "0xd803", + "EventName": "MMU_D_TRANS_CACHE_HIT_S1L0_WALK", + "BriefDescription": "Data TLB translation cache hit on S1L0 walk cache entry" + }, + { + "PublicDescription": "Data TLB translation cache hit on S2L2 walk cache entry", + "EventCode": "0xd804", + "EventName": "MMU_D_TRANS_CACHE_HIT_S2L2_WALK", + "BriefDescription": "Data TLB translation cache hit on S2L2 walk cache entry" + }, + { + "PublicDescrition": "Data TLB translation cache hit on S2L1 walk cache entry", + "EventCode": "0xd805", + "EventName": "MMU_D_TRANS_CACHE_HIT_S2L1_WALK", + "BriefDescription": "Data TLB translation cache hit on S2L1 walk cache entry" + }, + { + "PublicDescrition": "Data TLB translation cache hit on S2L0 walk cache entry", + "EventCode": "0xd806", + "EventName": "MMU_D_TRANS_CACHE_HIT_S2L0_WALK", + "BriefDescription": "Data TLB translation cache hit on S2L0 walk cache entry" + }, + { + "PublicDescrition": "Data-side S1 page walk cache lookup", + "EventCode": "0xd807", + "EventName": "MMU_D_S1_WALK_CACHE_LOOKUP", + "BriefDescription": "Data-side S1 page walk cache lookup" + }, + { + "PublicDescrition": "Data-side S1 page walk cache refill", + "EventCode": "0xd808", + "EventName": "MMU_D_S1_WALK_CACHE_REFILL", + "BriefDescription": "Data-side S1 page walk cache refill" + }, + { + "PublicDescrition": "Data-side S2 page walk cache lookup", + "EventCode": "0xd809", + "EventName": "MMU_D_S2_WALK_CACHE_LOOKUP", + "BriefDescription": "Data-side S2 page walk cache lookup" + }, + { + "PublicDescrition": "Data-side S2 page walk cache refill", + "EventCode": "0xd80a", + "EventName": "MMU_D_S2_WALK_CACHE_REFILL", + "BriefDescription": "Data-side S2 page walk cache refill" + }, + { + "PublicDescription": "Data-side S1 table walk fault", + "EventCode": "0xD80B", + "EventName": "MMU_D_S1_WALK_FAULT", + "BriefDescription": "Data-side S1 table walk fault" + }, + { + "PublicDescription": "Data-side S2 table walk fault", + "EventCode": "0xD80C", + "EventName": "MMU_D_S2_WALK_FAULT", + "BriefDescription": "Data-side S2 table walk fault" + }, + { + "PublicDescription": "Data-side table walk steps or descriptor fetches", + "EventCode": "0xD80D", + "EventName": "MMU_D_WALK_STEPS", + "BriefDescription": "Data-side table walk steps or descriptor fetches" + }, + { + "PublicDescription": "Level 2 instruction translation buffer allocation", + "EventCode": "0xD900", + "EventName": "MMU_I_OTB_ALLOC", + "BriefDescription": "Level 2 instruction translation buffer allocation" + }, + { + "PublicDescrition": "Instruction TLB translation cache hit on S1L2 walk cache entry", + "EventCode": "0xd901", + "EventName": "MMU_I_TRANS_CACHE_HIT_S1L2_WALK", + "BriefDescription": "Instruction TLB translation cache hit on S1L2 walk cache entry" + }, + { + "PublicDescrition": "Instruction TLB translation cache hit on S1L1 walk cache entry", + "EventCode": "0xd902", + "EventName": "MMU_I_TRANS_CACHE_HIT_S1L1_WALK", + "BriefDescription": "Instruction TLB translation cache hit on S1L1 walk cache entry" + }, + { + "PublicDescrition": "Instruction TLB translation cache hit on S1L0 walk cache entry", + "EventCode": "0xd903", + "EventName": "MMU_I_TRANS_CACHE_HIT_S1L0_WALK", + "BriefDescription": "Instruction TLB translation cache hit on S1L0 walk cache entry" + }, + { + "PublicDescrition": "Instruction TLB translation cache hit on S2L2 walk cache entry", + "EventCode": "0xd904", + "EventName": "MMU_I_TRANS_CACHE_HIT_S2L2_WALK", + "BriefDescription": "Instruction TLB translation cache hit on S2L2 walk cache entry" + }, + { + "PublicDescrition": "Instruction TLB translation cache hit on S2L1 walk cache entry", + "EventCode": "0xd905", + "EventName": "MMU_I_TRANS_CACHE_HIT_S2L1_WALK", + "BriefDescription": "Instruction TLB translation cache hit on S2L1 walk cache entry" + }, + { + "PublicDescrition": "Instruction TLB translation cache hit on S2L0 walk cache entry", + "EventCode": "0xd906", + "EventName": "MMU_I_TRANS_CACHE_HIT_S2L0_WALK", + "BriefDescription": "Instruction TLB translation cache hit on S2L0 walk cache entry" + }, + { + "PublicDescrition": "Instruction-side S1 page walk cache lookup", + "EventCode": "0xd907", + "EventName": "MMU_I_S1_WALK_CACHE_LOOKUP", + "BriefDescription": "Instruction-side S1 page walk cache lookup" + }, + { + "PublicDescrition": "Instruction-side S1 page walk cache refill", + "EventCode": "0xd908", + "EventName": "MMU_I_S1_WALK_CACHE_REFILL", + "BriefDescription": "Instruction-side S1 page walk cache refill" + }, + { + "PublicDescrition": "Instruction-side S2 page walk cache lookup", + "EventCode": "0xd909", + "EventName": "MMU_I_S2_WALK_CACHE_LOOKUP", + "BriefDescription": "Instruction-side S2 page walk cache lookup" + }, + { + "PublicDescrition": "Instruction-side S2 page walk cache refill", + "EventCode": "0xd90a", + "EventName": "MMU_I_S2_WALK_CACHE_REFILL", + "BriefDescription": "Instruction-side S2 page walk cache refill" + }, + { + "PublicDescription": "Instruction-side S1 table walk fault", + "EventCode": "0xD90B", + "EventName": "MMU_I_S1_WALK_FAULT", + "BriefDescription": "Instruction-side S1 table walk fault" + }, + { + "PublicDescription": "Instruction-side S2 table walk fault", + "EventCode": "0xD90C", + "EventName": "MMU_I_S2_WALK_FAULT", + "BriefDescription": "Instruction-side S2 table walk fault" + }, + { + "PublicDescription": "Instruction-side table walk steps or descriptor fetches", + "EventCode": "0xD90D", + "EventName": "MMU_I_WALK_STEPS", + "BriefDescription": "Instruction-side table walk steps or descriptor fetches" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/pipeline.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/pipeline.json new file mode 100644 index 0000000000000..2fb2d1f183fc7 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/pipeline.json @@ -0,0 +1,41 @@ +[ + { + "ArchStdEvent": "STALL_FRONTEND", + "Errata": "Errata AC03_CPU_29", + "BriefDescription": "Impacted by errata, use metrics instead -" + }, + { + "ArchStdEvent": "STALL_BACKEND" + }, + { + "ArchStdEvent": "STALL", + "Errata": "Errata AC03_CPU_29", + "BriefDescription": "Impacted by errata, use metrics instead -" + }, + { + "ArchStdEvent": "STALL_SLOT_BACKEND" + }, + { + "ArchStdEvent": "STALL_SLOT_FRONTEND", + "Errata": "Errata AC03_CPU_29", + "BriefDescription": "Impacted by errata, use metrics instead -" + }, + { + "ArchStdEvent": "STALL_SLOT" + }, + { + "ArchStdEvent": "STALL_BACKEND_MEM" + }, + { + "PublicDescription": "Frontend stall cycles, TLB", + "EventCode": "0x815c", + "EventName": "STALL_FRONTEND_TLB", + "BriefDescription": "Frontend stall cycles, TLB" + }, + { + "PublicDescription": "Backend stall cycles, TLB", + "EventCode": "0x8167", + "EventName": "STALL_BACKEND_TLB", + "BriefDescription": "Backend stall cycles, TLB" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/spe.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/spe.json new file mode 100644 index 0000000000000..20f2165c85fec --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/spe.json @@ -0,0 +1,14 @@ +[ + { + "ArchStdEvent": "SAMPLE_POP" + }, + { + "ArchStdEvent": "SAMPLE_FEED" + }, + { + "ArchStdEvent": "SAMPLE_FILTRATE" + }, + { + "ArchStdEvent": "SAMPLE_COLLISION" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv index 5b58db5032c11..f4d1ca4d1493d 100644 --- a/tools/perf/pmu-events/arch/arm64/mapfile.csv +++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv @@ -42,3 +42,4 @@ 0x00000000480fd010,v1,hisilicon/hip08,core 0x00000000500f0000,v1,ampere/emag,core 0x00000000c00fac30,v1,ampere/ampereone,core +0x00000000c00fac40,v1,ampere/ampereonex,core -- GitLab From b809fc656e763296f227b9b31e8f225e5977a8af Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 29 Nov 2023 13:34:25 -0800 Subject: [PATCH 0194/1290] perf build: Shellcheck support for OUTPUT directory Migrate Makefile.tests to Build so that variables like rule_mkdir are defined via Makefile.build (needed so the output directory can be created). This requires SHELLCHECK being exported and the clean rule tweaking to remove the files in find. Change find "-perm -o=x" as it was failing on my Debian based Linux kernel tree, switch to using "-executable". Adding a filename prefix of "." to the shellcheck log files is a pain and error prone in make, remove this prefix and just add the shellcheck log files to .gitignore. Fix the command echo so that running the test is displayed. Fixes: 1638b11ef8156c85 ("perf tools: Add perf binary dependent rule for shellcheck log in Makefile.perf") Reviewed-by: Athira Jajeev Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231129213428.2227448-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/.gitignore | 3 +++ tools/perf/Makefile.perf | 30 ++++++++++-------------------- tools/perf/tests/Build | 14 ++++++++++++++ tools/perf/tests/Makefile.tests | 22 ---------------------- 4 files changed, 27 insertions(+), 42 deletions(-) delete mode 100644 tools/perf/tests/Makefile.tests diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index ee5c14f3b8b19..f5b81d439387a 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -39,6 +39,9 @@ trace/beauty/generated/ pmu-events/pmu-events.c pmu-events/jevents pmu-events/metric_test.log +tests/shell/*.shellcheck_log +tests/shell/coresight/*.shellcheck_log +tests/shell/lib/*.shellcheck_log feature/ libapi/ libbpf/ diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 824cbc0af7d7a..1ab2a908f240a 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -229,8 +229,15 @@ else force_fixdep := $(config) endif +# Runs shellcheck on perf test shell scripts +ifeq ($(NO_SHELLCHECK),1) + SHELLCHECK := +else + SHELLCHECK := $(shell which shellcheck 2> /dev/null) +endif + export srctree OUTPUT RM CC CXX LD AR CFLAGS CXXFLAGS V BISON FLEX AWK -export HOSTCC HOSTLD HOSTAR HOSTCFLAGS +export HOSTCC HOSTLD HOSTAR HOSTCFLAGS SHELLCHECK include $(srctree)/tools/build/Makefile.include @@ -673,23 +680,7 @@ $(PERF_IN): prepare FORCE $(PMU_EVENTS_IN): FORCE prepare $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=pmu-events obj=pmu-events -# Runs shellcheck on perf test shell scripts - -SHELLCHECK := $(shell which shellcheck 2> /dev/null) - -ifeq ($(NO_SHELLCHECK),1) -SHELLCHECK := -endif - -ifneq ($(SHELLCHECK),) -SHELLCHECK_TEST: FORCE prepare - $(Q)$(MAKE) -f $(srctree)/tools/perf/tests/Makefile.tests -else -SHELLCHECK_TEST: - @: -endif - -$(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(PMU_EVENTS_IN) SHELLCHECK_TEST +$(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(PMU_EVENTS_IN) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) \ $(PERF_IN) $(PMU_EVENTS_IN) $(LIBS) -o $@ @@ -1152,9 +1143,8 @@ bpf-skel-clean: $(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS) clean:: $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBSYMBOL)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean tests-coresight-targets-clean - $(Q)$(MAKE) -f $(srctree)/tools/perf/tests/Makefile.tests clean $(call QUIET_CLEAN, core-objs) $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-iostat $(LANG_BINDINGS) - $(Q)find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete + $(Q)find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete -o -name '*.shellcheck_log' -delete $(Q)$(RM) $(OUTPUT).config-detected $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(OUTPUT)$(LIBJVMTI).so $(call QUIET_CLEAN, core-gen) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \ diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index 2b45ffa462a6c..53ba9c3e20e05 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -77,3 +77,17 @@ CFLAGS_python-use.o += -DPYTHONPATH="BUILD_STR($(OUTPUT)python)" -DPYTHON="BUI CFLAGS_dwarf-unwind.o += -fno-optimize-sibling-calls perf-y += workloads/ + +ifdef SHELLCHECK + SHELL_TESTS := $(shell find tests/shell -executable -type f -name '*.sh') + TEST_LOGS := $(SHELL_TESTS:tests/shell/%=shell/%.shellcheck_log) +else + SHELL_TESTS := + TEST_LOGS := +endif + +$(OUTPUT)%.shellcheck_log: % + $(call rule_mkdir) + $(Q)$(call echo-cmd,test)shellcheck -a -S warning "$<" > $@ || (cat $@ && rm $@ && false) + +perf-y += $(TEST_LOGS) diff --git a/tools/perf/tests/Makefile.tests b/tools/perf/tests/Makefile.tests deleted file mode 100644 index fdaca5f7a946d..0000000000000 --- a/tools/perf/tests/Makefile.tests +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Athira Rajeev , 2023 - -PROGS := $(shell find tests/shell -perm -o=x -type f -name '*.sh') -FILE_NAME := $(notdir $(PROGS)) -FILE_NAME := $(FILE_NAME:%=.%) -LOGS := $(join $(dir $(PROGS)),$(FILE_NAME)) -LOGS := $(LOGS:%=%.shellcheck_log) - -.PHONY: all -all: SHELLCHECK_RUN - @: - -SHELLCHECK_RUN: $(LOGS) - -.%.shellcheck_log: % - $(call rule_mkdir) - $(Q)$(call frecho-cmd,test)@shellcheck -S warning "$<" > $@ || (cat $@ && rm $@ && false) - -clean: - $(eval log_files := $(shell find . -name '.*.shellcheck_log')) - @rm -rf $(log_files) -- GitLab From 8226e4a3b35f525d11934484ee5979399ff8b7dc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 29 Nov 2023 13:34:27 -0800 Subject: [PATCH 0195/1290] perf test: Use common python setup library Avoid replicated logic by having a common library to set the PYTHON environment variable. Reviewed-by: Athira Jajeev Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231129213428.2227448-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lib/setup_python.sh | 16 ++++++++++++++++ tools/perf/tests/shell/stat+json_output.sh | 16 +++------------- tools/perf/tests/shell/stat_metrics_values.sh | 14 ++++---------- .../tests/shell/test_perf_data_converter_json.sh | 13 +++---------- 4 files changed, 26 insertions(+), 33 deletions(-) create mode 100644 tools/perf/tests/shell/lib/setup_python.sh diff --git a/tools/perf/tests/shell/lib/setup_python.sh b/tools/perf/tests/shell/lib/setup_python.sh new file mode 100644 index 0000000000000..c2fce1793538d --- /dev/null +++ b/tools/perf/tests/shell/lib/setup_python.sh @@ -0,0 +1,16 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +if [ "x$PYTHON" = "x" ] +then + python3 --version >/dev/null 2>&1 && PYTHON=python3 +fi +if [ "x$PYTHON" = "x" ] +then + python --version >/dev/null 2>&1 && PYTHON=python +fi +if [ "x$PYTHON" = "x" ] +then + echo Skipping test, python not detected please set environment variable PYTHON. + exit 2 +fi diff --git a/tools/perf/tests/shell/stat+json_output.sh b/tools/perf/tests/shell/stat+json_output.sh index 196e22672c50c..3bc900533a5d6 100755 --- a/tools/perf/tests/shell/stat+json_output.sh +++ b/tools/perf/tests/shell/stat+json_output.sh @@ -8,20 +8,10 @@ set -e skip_test=0 +shelldir=$(dirname "$0") +# shellcheck source=lib/setup_python.sh +. "${shelldir}"/lib/setup_python.sh pythonchecker=$(dirname $0)/lib/perf_json_output_lint.py -if [ "x$PYTHON" == "x" ] -then - if which python3 > /dev/null - then - PYTHON=python3 - elif which python > /dev/null - then - PYTHON=python - else - echo Skipping test, python not detected please set environment variable PYTHON. - exit 2 - fi -fi stat_output=$(mktemp /tmp/__perf_test.stat_output.json.XXXXX) diff --git a/tools/perf/tests/shell/stat_metrics_values.sh b/tools/perf/tests/shell/stat_metrics_values.sh index ad94c936de7e8..7ca172599aa6c 100755 --- a/tools/perf/tests/shell/stat_metrics_values.sh +++ b/tools/perf/tests/shell/stat_metrics_values.sh @@ -1,16 +1,10 @@ #!/bin/bash # perf metrics value validation # SPDX-License-Identifier: GPL-2.0 -if [ "x$PYTHON" == "x" ] -then - if which python3 > /dev/null - then - PYTHON=python3 - else - echo Skipping test, python3 not detected please set environment variable PYTHON. - exit 2 - fi -fi + +shelldir=$(dirname "$0") +# shellcheck source=lib/setup_python.sh +. "${shelldir}"/lib/setup_python.sh grep -q GenuineIntel /proc/cpuinfo || { echo Skipping non-Intel; exit 2; } diff --git a/tools/perf/tests/shell/test_perf_data_converter_json.sh b/tools/perf/tests/shell/test_perf_data_converter_json.sh index 6ded58f98f55b..c4f1b59d116f6 100755 --- a/tools/perf/tests/shell/test_perf_data_converter_json.sh +++ b/tools/perf/tests/shell/test_perf_data_converter_json.sh @@ -6,16 +6,9 @@ set -e err=0 -if [ "$PYTHON" = "" ] ; then - if which python3 > /dev/null ; then - PYTHON=python3 - elif which python > /dev/null ; then - PYTHON=python - else - echo Skipping test, python not detected please set environment variable PYTHON. - exit 2 - fi -fi +shelldir=$(dirname "$0") +# shellcheck source=lib/setup_python.sh +. "${shelldir}"/lib/setup_python.sh perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX) result=$(mktemp /tmp/__perf_test.output.json.XXXXX) -- GitLab From 7d723ef83b8070ab2304df22ed952dc627385406 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 29 Nov 2023 13:34:28 -0800 Subject: [PATCH 0196/1290] perf test: Add basic 'perf list --json" test Test that JSON output produces valid JSON. Reviewed-by: Athira Jajeev Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Jajeev Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231129213428.2227448-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/list.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100755 tools/perf/tests/shell/list.sh diff --git a/tools/perf/tests/shell/list.sh b/tools/perf/tests/shell/list.sh new file mode 100755 index 0000000000000..22b004f2b23ec --- /dev/null +++ b/tools/perf/tests/shell/list.sh @@ -0,0 +1,19 @@ +#!/bin/sh +# perf list tests +# SPDX-License-Identifier: GPL-2.0 + +set -e +err=0 + +shelldir=$(dirname "$0") +# shellcheck source=lib/setup_python.sh +. "${shelldir}"/lib/setup_python.sh + +test_list_json() { + echo "Json output test" + perf list -j | $PYTHON -m json.tool + echo "Json output test [Success]" +} + +test_list_json +exit $err -- GitLab From 9eef41014fe01287dae79fe208b9b433b13040bb Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Thu, 23 Nov 2023 21:31:10 +0530 Subject: [PATCH 0197/1290] perf vendor events powerpc: Update datasource event name to fix duplicate events Running "perf list" on powerpc fails with segfault as below: $ ./perf list Segmentation fault (core dumped) $ This happens because of duplicate events in the JSON list. The powerpc JSON event list contains some event with same event name, but different event code. They are: - PM_INST_FROM_L3MISS (Present in datasource and frontend) - PM_MRK_DATA_FROM_L2MISS (Present in datasource and marked) - PM_MRK_INST_FROM_L3MISS (Present in datasource and marked) - PM_MRK_DATA_FROM_L3MISS (Present in datasource and marked) pmu_events_table__num_events() uses the value from table_pmu->num_entries which includes duplicate events as well. This causes issue during "perf list" and results in a segmentation fault. Since both event codes are valid, append _DSRC to the Data Source events (datasource.json), so that they would have a unique name. Also add PM_DATA_FROM_L2MISS_DSRC and PM_DATA_FROM_L3MISS_DSRC events. With the fix, 'perf list' works as expected. Fixes: fc143580753348c6 ("perf vendor events power10: Update JSON/events") Signed-off-by: Athira Jajeev Tested-by: Disha Goel Cc: Adrian Hunter Cc: Disha Goel Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Kajol Jain Cc: linuxppc-dev@lists.ozlabs.org Cc: Madhavan Srinivasan Cc: Namhyung Kim Link: https://lore.kernel.org/r/20231123160110.94090-1-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/powerpc/power10/datasource.json | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/perf/pmu-events/arch/powerpc/power10/datasource.json b/tools/perf/pmu-events/arch/powerpc/power10/datasource.json index 6b0356f2d3013..0eeaaf1a95b86 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/datasource.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/datasource.json @@ -99,6 +99,11 @@ "EventName": "PM_INST_FROM_L2MISS", "BriefDescription": "The processor's instruction cache was reloaded from a source beyond the local core's L2 due to a demand miss." }, + { + "EventCode": "0x0003C0000000C040", + "EventName": "PM_DATA_FROM_L2MISS_DSRC", + "BriefDescription": "The processor's L1 data cache was reloaded from a source beyond the local core's L2 due to a demand miss." + }, { "EventCode": "0x000380000010C040", "EventName": "PM_INST_FROM_L2MISS_ALL", @@ -161,9 +166,14 @@ }, { "EventCode": "0x000780000000C040", - "EventName": "PM_INST_FROM_L3MISS", + "EventName": "PM_INST_FROM_L3MISS_DSRC", "BriefDescription": "The processor's instruction cache was reloaded from beyond the local core's L3 due to a demand miss." }, + { + "EventCode": "0x0007C0000000C040", + "EventName": "PM_DATA_FROM_L3MISS_DSRC", + "BriefDescription": "The processor's L1 data cache was reloaded from beyond the local core's L3 due to a demand miss." + }, { "EventCode": "0x000780000010C040", "EventName": "PM_INST_FROM_L3MISS_ALL", @@ -981,7 +991,7 @@ }, { "EventCode": "0x0003C0000000C142", - "EventName": "PM_MRK_DATA_FROM_L2MISS", + "EventName": "PM_MRK_DATA_FROM_L2MISS_DSRC", "BriefDescription": "The processor's L1 data cache was reloaded from a source beyond the local core's L2 due to a demand miss for a marked instruction." }, { @@ -1046,12 +1056,12 @@ }, { "EventCode": "0x000780000000C142", - "EventName": "PM_MRK_INST_FROM_L3MISS", + "EventName": "PM_MRK_INST_FROM_L3MISS_DSRC", "BriefDescription": "The processor's instruction cache was reloaded from beyond the local core's L3 due to a demand miss for a marked instruction." }, { "EventCode": "0x0007C0000000C142", - "EventName": "PM_MRK_DATA_FROM_L3MISS", + "EventName": "PM_MRK_DATA_FROM_L3MISS_DSRC", "BriefDescription": "The processor's L1 data cache was reloaded from beyond the local core's L3 due to a demand miss for a marked instruction." }, { -- GitLab From a4320085a6c694326dd8db46f563d52d1a826f07 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 28 Nov 2023 12:39:40 -0800 Subject: [PATCH 0198/1290] perf mem: Fix error on hybrid related to availability of mem event in a PMU The below error can be triggered on a hybrid machine. $ perf mem record -t load sleep 1 event syntax error: 'breakpoint/mem-loads,ldlat=30/P' \___ Bad event or PMU Unable to find PMU or event on a PMU of 'breakpoint' In the perf_mem_events__record_args(), the current perf never checks the availability of a mem event on a given PMU. All the PMUs will be added to the perf mem event list. Perf errors out for the unsupported PMU. Extend perf_mem_event__supported() and take a PMU into account. Check the mem event for each PMU before adding it to the perf mem event list. Optimize the perf_mem_events__init() a little bit. The function is to check whether the mem events are supported in the system. It doesn't need to scan all PMUs. Just return with the first supported PMU is good enough. Fixes: 5752c20f3787c9bc ("perf mem: Scan all PMUs instead of just core ones") Reported-by: Ammy Yi Signed-off-by: Kan Liang Tested-by: Ammy Yi Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20231128203940.3964287-1-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/mem-events.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 954b235e12e51..3a2e3687878c1 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -100,11 +100,14 @@ int perf_mem_events__parse(const char *str) return -1; } -static bool perf_mem_event__supported(const char *mnt, char *sysfs_name) +static bool perf_mem_event__supported(const char *mnt, struct perf_pmu *pmu, + struct perf_mem_event *e) { + char sysfs_name[100]; char path[PATH_MAX]; struct stat st; + scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, pmu->name); scnprintf(path, PATH_MAX, "%s/devices/%s", mnt, sysfs_name); return !stat(path, &st); } @@ -120,7 +123,6 @@ int perf_mem_events__init(void) for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) { struct perf_mem_event *e = perf_mem_events__ptr(j); - char sysfs_name[100]; struct perf_pmu *pmu = NULL; /* @@ -136,12 +138,12 @@ int perf_mem_events__init(void) * of core PMU. */ while ((pmu = perf_pmus__scan(pmu)) != NULL) { - scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, pmu->name); - e->supported |= perf_mem_event__supported(mnt, sysfs_name); + e->supported |= perf_mem_event__supported(mnt, pmu, e); + if (e->supported) { + found = true; + break; + } } - - if (e->supported) - found = true; } return found ? 0 : -ENOENT; @@ -167,13 +169,10 @@ static void perf_mem_events__print_unsupport_hybrid(struct perf_mem_event *e, int idx) { const char *mnt = sysfs__mount(); - char sysfs_name[100]; struct perf_pmu *pmu = NULL; while ((pmu = perf_pmus__scan(pmu)) != NULL) { - scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, - pmu->name); - if (!perf_mem_event__supported(mnt, sysfs_name)) { + if (!perf_mem_event__supported(mnt, pmu, e)) { pr_err("failed: event '%s' not supported\n", perf_mem_events__name(idx, pmu->name)); } @@ -183,6 +182,7 @@ static void perf_mem_events__print_unsupport_hybrid(struct perf_mem_event *e, int perf_mem_events__record_args(const char **rec_argv, int *argv_nr, char **rec_tmp, int *tmp_nr) { + const char *mnt = sysfs__mount(); int i = *argv_nr, k = 0; struct perf_mem_event *e; @@ -211,6 +211,9 @@ int perf_mem_events__record_args(const char **rec_argv, int *argv_nr, while ((pmu = perf_pmus__scan(pmu)) != NULL) { const char *s = perf_mem_events__name(j, pmu->name); + if (!perf_mem_event__supported(mnt, pmu, e)) + continue; + rec_argv[i++] = "-e"; if (s) { char *copy = strdup(s); -- GitLab From 144081ef78c36e255c08b74da86ef68e6cf0a221 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 20 Nov 2023 11:04:08 -0800 Subject: [PATCH 0199/1290] perf test: Add basic 'perf diff' test There are some old bug reports on perf diff crashing: https://rhaas.blogspot.com/2012/06/perf-good-bad-ugly.html Happening across them I was prompted to add two very basic tests that will give some 'perf diff' coverage. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Jajeev Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231120190408.281826-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/diff.sh | 101 +++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100755 tools/perf/tests/shell/diff.sh diff --git a/tools/perf/tests/shell/diff.sh b/tools/perf/tests/shell/diff.sh new file mode 100755 index 0000000000000..2131857636888 --- /dev/null +++ b/tools/perf/tests/shell/diff.sh @@ -0,0 +1,101 @@ +#!/bin/sh +# perf diff tests +# SPDX-License-Identifier: GPL-2.0 + +set -e + +err=0 +perfdata1=$(mktemp /tmp/__perf_test.perf.data.XXXXX) +perfdata2=$(mktemp /tmp/__perf_test.perf.data.XXXXX) +perfdata3=$(mktemp /tmp/__perf_test.perf.data.XXXXX) +testprog="perf test -w thloop" +testsym="test_loop" + +cleanup() { + rm -rf "${perfdata1}" + rm -rf "${perfdata1}".old + rm -rf "${perfdata2}" + rm -rf "${perfdata2}".old + rm -rf "${perfdata3}" + rm -rf "${perfdata3}".old + + trap - EXIT TERM INT +} + +trap_cleanup() { + cleanup + exit 1 +} +trap trap_cleanup EXIT TERM INT + +make_data() { + file="$1" + if ! perf record -o "${file}" ${testprog} 2> /dev/null + then + echo "Workload record [Failed record]" + echo 1 + return + fi + if ! perf report -i "${file}" -q | grep -q "${testsym}" + then + echo "Workload record [Failed missing output]" + echo 1 + return + fi + echo 0 +} + +test_two_files() { + echo "Basic two file diff test" + err=$(make_data "${perfdata1}") + if [ $err != 0 ] + then + return + fi + err=$(make_data "${perfdata2}") + if [ $err != 0 ] + then + return + fi + + if ! perf diff "${perfdata1}" "${perfdata2}" | grep -q "${testsym}" + then + echo "Basic two file diff test [Failed diff]" + err=1 + return + fi + echo "Basic two file diff test [Success]" +} + +test_three_files() { + echo "Basic three file diff test" + err=$(make_data "${perfdata1}") + if [ $err != 0 ] + then + return + fi + err=$(make_data "${perfdata2}") + if [ $err != 0 ] + then + return + fi + err=$(make_data "${perfdata3}") + if [ $err != 0 ] + then + return + fi + + if ! perf diff "${perfdata1}" "${perfdata2}" "${perfdata3}" | grep -q "${testsym}" + then + echo "Basic three file diff test [Failed diff]" + err=1 + return + fi + echo "Basic three file diff test [Success]" +} + +test_two_files +test_three_files + +cleanup +exit $err -- GitLab From 018b04248543f49d677011d4615f243a39a155a8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 30 Jun 2023 09:00:29 +0100 Subject: [PATCH 0200/1290] perf bench sched-seccomp-notify: Fix spelling mistake "synchronious" -> "synchronous" There is a spelling mistake in an option description. Fix it. Signed-off-by: Colin Ian King Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Link: https://lore.kernel.org/r/20230630080029.15614-1-colin.i.king@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/sched-seccomp-notify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/bench/sched-seccomp-notify.c b/tools/perf/bench/sched-seccomp-notify.c index a01c40131493b..269c1f4a6852c 100644 --- a/tools/perf/bench/sched-seccomp-notify.c +++ b/tools/perf/bench/sched-seccomp-notify.c @@ -32,7 +32,7 @@ static bool sync_mode; static const struct option options[] = { OPT_U64('l', "loop", &loops, "Specify number of loops"), OPT_BOOLEAN('s', "sync-mode", &sync_mode, - "Enable the synchronious mode for seccomp notifications"), + "Enable the synchronous mode for seccomp notifications"), OPT_END() }; -- GitLab From eb2eac0c7b6180332ced94d2979f3d3c7d22aaff Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 20 Nov 2023 16:04:20 -0800 Subject: [PATCH 0201/1290] perf evsel: Fallback to "task-clock" when not system wide When the "cycles" event isn't available evsel will fallback to the "cpu-clock" software event. "task-clock" is similar to "cpu-clock" but only runs when the process is running. Falling back to "cpu-clock" when not system wide leads to confusion, by falling back to "task-clock" it is hoped the confusion is less. Pass the target to determine if "task-clock" is more appropriate. Update a nearby comment and debug string for the change. Signed-off-by: Ian Rogers Tested-by: Athira Rajeev Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Ajay Kaher Cc: Alexander Shishkin Cc: Alexey Makhalov Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Yang Jihong Link: https://lore.kernel.org/r/20231121000420.368075-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-stat.c | 2 +- tools/perf/builtin-top.c | 2 +- tools/perf/util/evsel.c | 18 ++++++++++-------- tools/perf/util/evsel.h | 3 ++- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 9b4f3805ca92f..db814eadc16b9 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1360,7 +1360,7 @@ static int record__open(struct record *rec) evlist__for_each_entry(evlist, pos) { try_again: if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) { - if (evsel__fallback(pos, errno, msg, sizeof(msg))) { + if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) { if (verbose > 0) ui__warning("%s\n", msg); goto try_again; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a3af805a1d572..d8e5d6f7a87a6 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -653,7 +653,7 @@ static enum counter_recovery stat_handle_error(struct evsel *counter) if ((evsel__leader(counter) != counter) || !(counter->core.leader->nr_members > 1)) return COUNTER_SKIP; - } else if (evsel__fallback(counter, errno, msg, sizeof(msg))) { + } else if (evsel__fallback(counter, &target, errno, msg, sizeof(msg))) { if (verbose > 0) ui__warning("%s\n", msg); return COUNTER_RETRY; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index ea8c7eca5eeed..1e42bd1c7d5a5 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1044,7 +1044,7 @@ try_again: perf_top_overwrite_fallback(top, counter)) goto try_again; - if (evsel__fallback(counter, errno, msg, sizeof(msg))) { + if (evsel__fallback(counter, &opts->target, errno, msg, sizeof(msg))) { if (verbose > 0) ui__warning("%s\n", msg); goto try_again; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index a5da74e3a517b..532f34d9fcb50 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2853,7 +2853,8 @@ u64 evsel__intval_common(struct evsel *evsel, struct perf_sample *sample, const #endif -bool evsel__fallback(struct evsel *evsel, int err, char *msg, size_t msgsize) +bool evsel__fallback(struct evsel *evsel, struct target *target, int err, + char *msg, size_t msgsize) { int paranoid; @@ -2861,18 +2862,19 @@ bool evsel__fallback(struct evsel *evsel, int err, char *msg, size_t msgsize) evsel->core.attr.type == PERF_TYPE_HARDWARE && evsel->core.attr.config == PERF_COUNT_HW_CPU_CYCLES) { /* - * If it's cycles then fall back to hrtimer based - * cpu-clock-tick sw counter, which is always available even if - * no PMU support. + * If it's cycles then fall back to hrtimer based cpu-clock sw + * counter, which is always available even if no PMU support. * * PPC returns ENXIO until 2.6.37 (behavior changed with commit * b0a873e). */ - scnprintf(msg, msgsize, "%s", -"The cycles event is not supported, trying to fall back to cpu-clock-ticks"); - evsel->core.attr.type = PERF_TYPE_SOFTWARE; - evsel->core.attr.config = PERF_COUNT_SW_CPU_CLOCK; + evsel->core.attr.config = target__has_cpu(target) + ? PERF_COUNT_SW_CPU_CLOCK + : PERF_COUNT_SW_TASK_CLOCK; + scnprintf(msg, msgsize, + "The cycles event is not supported, trying to fall back to %s", + target__has_cpu(target) ? "cpu-clock" : "task-clock"); zfree(&evsel->name); return true; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index f19ac9f027efc..efbb6e848287f 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -460,7 +460,8 @@ static inline bool evsel__is_clock(const struct evsel *evsel) evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK); } -bool evsel__fallback(struct evsel *evsel, int err, char *msg, size_t msgsize); +bool evsel__fallback(struct evsel *evsel, struct target *target, int err, + char *msg, size_t msgsize); int evsel__open_strerror(struct evsel *evsel, struct target *target, int err, char *msg, size_t size); -- GitLab From 030ac3cad28992ae9099a857848861053273cc8f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 27 Nov 2023 14:08:20 -0800 Subject: [PATCH 0202/1290] perf record: Be lazier in allocating lost samples buffer Wait until a lost sample occurs to allocate the lost samples buffer, often the buffer isn't necessary. This saves a 64kb allocation and 5.3kb of peak memory consumption. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Jajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Miguel Ojeda Cc: Ming Wang Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231127220902.1315692-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index db814eadc16b9..eb5a398ddb1d8 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1924,21 +1924,13 @@ static void __record__save_lost_samples(struct record *rec, struct evsel *evsel, static void record__read_lost_samples(struct record *rec) { struct perf_session *session = rec->session; - struct perf_record_lost_samples *lost; + struct perf_record_lost_samples *lost = NULL; struct evsel *evsel; /* there was an error during record__open */ if (session->evlist == NULL) return; - lost = zalloc(PERF_SAMPLE_MAX_SIZE); - if (lost == NULL) { - pr_debug("Memory allocation failed\n"); - return; - } - - lost->header.type = PERF_RECORD_LOST_SAMPLES; - evlist__for_each_entry(session->evlist, evsel) { struct xyarray *xy = evsel->core.sample_id; u64 lost_count; @@ -1961,6 +1953,14 @@ static void record__read_lost_samples(struct record *rec) } if (count.lost) { + if (!lost) { + lost = zalloc(PERF_SAMPLE_MAX_SIZE); + if (!lost) { + pr_debug("Memory allocation failed\n"); + return; + } + lost->header.type = PERF_RECORD_LOST_SAMPLES; + } __record__save_lost_samples(rec, evsel, lost, x, y, count.lost, 0); } @@ -1968,9 +1968,18 @@ static void record__read_lost_samples(struct record *rec) } lost_count = perf_bpf_filter__lost_count(evsel); - if (lost_count) + if (lost_count) { + if (!lost) { + lost = zalloc(PERF_SAMPLE_MAX_SIZE); + if (!lost) { + pr_debug("Memory allocation failed\n"); + return; + } + lost->header.type = PERF_RECORD_LOST_SAMPLES; + } __record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count, PERF_RECORD_MISC_LOST_SAMPLES_BPF); + } } out: free(lost); -- GitLab From d0acce68285e8645038a72c6792483160ca36e5a Mon Sep 17 00:00:00 2001 From: Chengen Du Date: Thu, 30 Nov 2023 21:57:23 +0800 Subject: [PATCH 0203/1290] perf symbols: Parse NOTE segments until the build id is found In the ELF file, multiple NOTE segments may exist. To locate the build id, the process shall persist in parsing NOTE segments until the build id is found. Signed-off-by: Chengen Du Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231130135723.17562-1-chengen.du@canonical.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol-minimal.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c index a81a14769bd10..1da8b713509c5 100644 --- a/tools/perf/util/symbol-minimal.c +++ b/tools/perf/util/symbol-minimal.c @@ -159,9 +159,10 @@ int filename__read_build_id(const char *filename, struct build_id *bid) goto out_free; ret = read_build_id(buf, buf_size, bid, need_swap); - if (ret == 0) + if (ret == 0) { ret = bid->size; - break; + break; + } } } else { Elf64_Ehdr ehdr; @@ -210,9 +211,10 @@ int filename__read_build_id(const char *filename, struct build_id *bid) goto out_free; ret = read_build_id(buf, buf_size, bid, need_swap); - if (ret == 0) + if (ret == 0) { ret = bid->size; - break; + break; + } } } out_free: -- GitLab From 407a3898d72ee0020b8cc9dd28b88c8eb8d68214 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 5 Dec 2023 08:49:24 -0800 Subject: [PATCH 0204/1290] perf test shell diff: Skip test if test_loop symbol is missing in the perf binary The diff test depends on finding the symbol test_loop in perf and will fail if perf has been stripped and no debug object is available. In that case, skip the test instead. Suggested-by: Adrian Hunter Signed-off-by: Ian Rogers Tested-by: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231205164924.835682-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/diff.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/perf/tests/shell/diff.sh b/tools/perf/tests/shell/diff.sh index 2131857636888..14b87af88703b 100755 --- a/tools/perf/tests/shell/diff.sh +++ b/tools/perf/tests/shell/diff.sh @@ -9,8 +9,15 @@ perfdata1=$(mktemp /tmp/__perf_test.perf.data.XXXXX) perfdata2=$(mktemp /tmp/__perf_test.perf.data.XXXXX) perfdata3=$(mktemp /tmp/__perf_test.perf.data.XXXXX) testprog="perf test -w thloop" + +shelldir=$(dirname "$0") +# shellcheck source=lib/perf_has_symbol.sh +. "${shelldir}"/lib/perf_has_symbol.sh + testsym="test_loop" +skip_test_missing_symbol ${testsym} + cleanup() { rm -rf "${perfdata1}" rm -rf "${perfdata1}".old -- GitLab From 9fa688ea341231837915f996b61f6e0a330d3b38 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 27 Nov 2023 14:08:24 -0800 Subject: [PATCH 0205/1290] perf map: Simplify map_ip/unmap_ip and make 'struct map' smaller When mapping an IP it is either an identity mapping or a DSO relative mapping, so a single bit is required in the struct to identify this. The current code uses function pointers, adding 2 pointers per map and also pushing the size of a map beyond 1 cache line. Switch to using a byte to identify the mapping type (as well as priv and erange_warned), to avoid any masking. Change struct maps's layout to avoid holes. Before: ``` struct map { u64 start; /* 0 8 */ u64 end; /* 8 8 */ _Bool erange_warned:1; /* 16: 0 1 */ _Bool priv:1; /* 16: 1 1 */ /* XXX 6 bits hole, try to pack */ /* XXX 3 bytes hole, try to pack */ u32 prot; /* 20 4 */ u64 pgoff; /* 24 8 */ u64 reloc; /* 32 8 */ u64 (*map_ip)(const struct map *, u64); /* 40 8 */ u64 (*unmap_ip)(const struct map *, u64); /* 48 8 */ struct dso * dso; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ refcount_t refcnt; /* 64 4 */ u32 flags; /* 68 4 */ /* size: 72, cachelines: 2, members: 12 */ /* sum members: 68, holes: 1, sum holes: 3 */ /* sum bitfield members: 2 bits, bit holes: 1, sum bit holes: 6 bits */ /* last cacheline: 8 bytes */ }; ``` After: ``` struct map { u64 start; /* 0 8 */ u64 end; /* 8 8 */ u64 pgoff; /* 16 8 */ u64 reloc; /* 24 8 */ struct dso * dso; /* 32 8 */ refcount_t refcnt; /* 40 4 */ u32 prot; /* 44 4 */ u32 flags; /* 48 4 */ enum mapping_type mapping_type:8; /* 52: 0 4 */ /* Bitfield combined with next fields */ _Bool erange_warned; /* 53 1 */ _Bool priv; /* 54 1 */ /* size: 56, cachelines: 1, members: 11 */ /* padding: 1 */ /* last cacheline: 56 bytes */ }; ``` Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Jajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Miguel Ojeda Cc: Ming Wang Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231127220902.1315692-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 3 +- tools/perf/util/map.c | 20 +-------- tools/perf/util/map.h | 83 +++++++++++++++++++----------------- tools/perf/util/symbol-elf.c | 6 +-- tools/perf/util/symbol.c | 6 +-- 5 files changed, 50 insertions(+), 68 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index a985d004aa8d8..c5de5363b5e75 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1359,8 +1359,7 @@ __machine__create_kernel_maps(struct machine *machine, struct dso *kernel) if (machine->vmlinux_map == NULL) return -ENOMEM; - map__set_map_ip(machine->vmlinux_map, identity__map_ip); - map__set_unmap_ip(machine->vmlinux_map, identity__map_ip); + map__set_mapping_type(machine->vmlinux_map, MAPPING_TYPE__IDENTITY); return maps__insert(machine__kernel_maps(machine), machine->vmlinux_map); } diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index f64b830044217..54c67cb7ecefa 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -109,8 +109,7 @@ void map__init(struct map *map, u64 start, u64 end, u64 pgoff, struct dso *dso) map__set_pgoff(map, pgoff); map__set_reloc(map, 0); map__set_dso(map, dso__get(dso)); - map__set_map_ip(map, map__dso_map_ip); - map__set_unmap_ip(map, map__dso_unmap_ip); + map__set_mapping_type(map, MAPPING_TYPE__DSO); map__set_erange_warned(map, false); refcount_set(map__refcnt(map), 1); } @@ -172,7 +171,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, map__init(result, start, start + len, pgoff, dso); if (anon || no_dso) { - map->map_ip = map->unmap_ip = identity__map_ip; + map->mapping_type = MAPPING_TYPE__IDENTITY; /* * Set memory without DSO as loaded. All map__find_* @@ -630,18 +629,3 @@ struct maps *map__kmaps(struct map *map) } return kmap->kmaps; } - -u64 map__dso_map_ip(const struct map *map, u64 ip) -{ - return ip - map__start(map) + map__pgoff(map); -} - -u64 map__dso_unmap_ip(const struct map *map, u64 ip) -{ - return ip + map__start(map) - map__pgoff(map); -} - -u64 identity__map_ip(const struct map *map __maybe_unused, u64 ip) -{ - return ip; -} diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 1b53d53adc866..3a3b7757da5fd 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -16,23 +16,25 @@ struct dso; struct maps; struct machine; +enum mapping_type { + /* map__map_ip/map__unmap_ip are given as offsets in the DSO. */ + MAPPING_TYPE__DSO, + /* map__map_ip/map__unmap_ip are just the given ip value. */ + MAPPING_TYPE__IDENTITY, +}; + DECLARE_RC_STRUCT(map) { u64 start; u64 end; - bool erange_warned:1; - bool priv:1; - u32 prot; u64 pgoff; u64 reloc; - - /* ip -> dso rip */ - u64 (*map_ip)(const struct map *, u64); - /* dso rip -> ip */ - u64 (*unmap_ip)(const struct map *, u64); - struct dso *dso; refcount_t refcnt; + u32 prot; u32 flags; + enum mapping_type mapping_type:8; + bool erange_warned; + bool priv; }; struct kmap; @@ -41,38 +43,11 @@ struct kmap *__map__kmap(struct map *map); struct kmap *map__kmap(struct map *map); struct maps *map__kmaps(struct map *map); -/* ip -> dso rip */ -u64 map__dso_map_ip(const struct map *map, u64 ip); -/* dso rip -> ip */ -u64 map__dso_unmap_ip(const struct map *map, u64 ip); -/* Returns ip */ -u64 identity__map_ip(const struct map *map __maybe_unused, u64 ip); - static inline struct dso *map__dso(const struct map *map) { return RC_CHK_ACCESS(map)->dso; } -static inline u64 map__map_ip(const struct map *map, u64 ip) -{ - return RC_CHK_ACCESS(map)->map_ip(map, ip); -} - -static inline u64 map__unmap_ip(const struct map *map, u64 ip) -{ - return RC_CHK_ACCESS(map)->unmap_ip(map, ip); -} - -static inline void *map__map_ip_ptr(struct map *map) -{ - return RC_CHK_ACCESS(map)->map_ip; -} - -static inline void* map__unmap_ip_ptr(struct map *map) -{ - return RC_CHK_ACCESS(map)->unmap_ip; -} - static inline u64 map__start(const struct map *map) { return RC_CHK_ACCESS(map)->start; @@ -123,6 +98,34 @@ static inline size_t map__size(const struct map *map) return map__end(map) - map__start(map); } +/* ip -> dso rip */ +static inline u64 map__dso_map_ip(const struct map *map, u64 ip) +{ + return ip - map__start(map) + map__pgoff(map); +} + +/* dso rip -> ip */ +static inline u64 map__dso_unmap_ip(const struct map *map, u64 ip) +{ + return ip + map__start(map) - map__pgoff(map); +} + +static inline u64 map__map_ip(const struct map *map, u64 ip) +{ + if ((RC_CHK_ACCESS(map)->mapping_type) == MAPPING_TYPE__DSO) + return map__dso_map_ip(map, ip); + else + return ip; +} + +static inline u64 map__unmap_ip(const struct map *map, u64 ip) +{ + if ((RC_CHK_ACCESS(map)->mapping_type) == MAPPING_TYPE__DSO) + return map__dso_unmap_ip(map, ip); + else + return ip; +} + /* rip/ip <-> addr suitable for passing to `objdump --start-address=` */ u64 map__rip_2objdump(struct map *map, u64 rip); @@ -294,13 +297,13 @@ static inline void map__set_dso(struct map *map, struct dso *dso) RC_CHK_ACCESS(map)->dso = dso; } -static inline void map__set_map_ip(struct map *map, u64 (*map_ip)(const struct map *map, u64 ip)) +static inline void map__set_mapping_type(struct map *map, enum mapping_type type) { - RC_CHK_ACCESS(map)->map_ip = map_ip; + RC_CHK_ACCESS(map)->mapping_type = type; } -static inline void map__set_unmap_ip(struct map *map, u64 (*unmap_ip)(const struct map *map, u64 rip)) +static inline enum mapping_type map__mapping_type(struct map *map) { - RC_CHK_ACCESS(map)->unmap_ip = unmap_ip; + return RC_CHK_ACCESS(map)->mapping_type; } #endif /* __PERF_MAP_H */ diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 9e7eeaf616b86..4b934ed3bfd13 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -1392,8 +1392,7 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, map__set_start(map, shdr->sh_addr + ref_reloc(kmap)); map__set_end(map, map__start(map) + shdr->sh_size); map__set_pgoff(map, shdr->sh_offset); - map__set_map_ip(map, map__dso_map_ip); - map__set_unmap_ip(map, map__dso_unmap_ip); + map__set_mapping_type(map, MAPPING_TYPE__DSO); /* Ensure maps are correctly ordered */ if (kmaps) { int err; @@ -1455,8 +1454,7 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, map__set_end(curr_map, map__start(curr_map) + shdr->sh_size); map__set_pgoff(curr_map, shdr->sh_offset); } else { - map__set_map_ip(curr_map, identity__map_ip); - map__set_unmap_ip(curr_map, identity__map_ip); + map__set_mapping_type(curr_map, MAPPING_TYPE__IDENTITY); } curr_dso->symtab_type = dso->symtab_type; if (maps__insert(kmaps, curr_map)) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 82cc74b9358e0..314c0263bf3c0 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -956,8 +956,7 @@ static int maps__split_kallsyms(struct maps *kmaps, struct dso *dso, u64 delta, return -1; } - map__set_map_ip(curr_map, identity__map_ip); - map__set_unmap_ip(curr_map, identity__map_ip); + map__set_mapping_type(curr_map, MAPPING_TYPE__IDENTITY); if (maps__insert(kmaps, curr_map)) { dso__put(ndso); return -1; @@ -1475,8 +1474,7 @@ static int dso__load_kcore(struct dso *dso, struct map *map, map__set_start(map, map__start(new_map)); map__set_end(map, map__end(new_map)); map__set_pgoff(map, map__pgoff(new_map)); - map__set_map_ip(map, map__map_ip_ptr(new_map)); - map__set_unmap_ip(map, map__unmap_ip_ptr(new_map)); + map__set_mapping_type(map, map__mapping_type(new_map)); /* Ensure maps are correctly ordered */ map_ref = map__get(map); maps__remove(kmaps, map_ref); -- GitLab From 0f6ab6a3fb7e380a1277f8288f315724ed517114 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 27 Nov 2023 14:08:25 -0800 Subject: [PATCH 0206/1290] perf maps: Move symbol maps functions to maps.c Move the find and certain other symbol maps__* functions to maps.c for better abstraction. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Jajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Miguel Ojeda Cc: Ming Wang Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231127220902.1315692-14-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 238 +++++++++++++++++++++++++++++++++++++ tools/perf/util/maps.h | 12 ++ tools/perf/util/symbol.c | 248 --------------------------------------- tools/perf/util/symbol.h | 1 - 4 files changed, 250 insertions(+), 249 deletions(-) diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 233438c95b531..9a011aed4b754 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -475,3 +475,241 @@ struct map_rb_node *map_rb_node__next(struct map_rb_node *node) return rb_entry(next, struct map_rb_node, rb_node); } + +static int map__strcmp(const void *a, const void *b) +{ + const struct map *map_a = *(const struct map **)a; + const struct map *map_b = *(const struct map **)b; + const struct dso *dso_a = map__dso(map_a); + const struct dso *dso_b = map__dso(map_b); + int ret = strcmp(dso_a->short_name, dso_b->short_name); + + if (ret == 0 && map_a != map_b) { + /* + * Ensure distinct but name equal maps have an order in part to + * aid reference counting. + */ + ret = (int)map__start(map_a) - (int)map__start(map_b); + if (ret == 0) + ret = (int)((intptr_t)map_a - (intptr_t)map_b); + } + + return ret; +} + +static int map__strcmp_name(const void *name, const void *b) +{ + const struct dso *dso = map__dso(*(const struct map **)b); + + return strcmp(name, dso->short_name); +} + +void __maps__sort_by_name(struct maps *maps) +{ + qsort(maps__maps_by_name(maps), maps__nr_maps(maps), sizeof(struct map *), map__strcmp); +} + +static int map__groups__sort_by_name_from_rbtree(struct maps *maps) +{ + struct map_rb_node *rb_node; + struct map **maps_by_name = realloc(maps__maps_by_name(maps), + maps__nr_maps(maps) * sizeof(struct map *)); + int i = 0; + + if (maps_by_name == NULL) + return -1; + + up_read(maps__lock(maps)); + down_write(maps__lock(maps)); + + RC_CHK_ACCESS(maps)->maps_by_name = maps_by_name; + RC_CHK_ACCESS(maps)->nr_maps_allocated = maps__nr_maps(maps); + + maps__for_each_entry(maps, rb_node) + maps_by_name[i++] = map__get(rb_node->map); + + __maps__sort_by_name(maps); + + up_write(maps__lock(maps)); + down_read(maps__lock(maps)); + + return 0; +} + +static struct map *__maps__find_by_name(struct maps *maps, const char *name) +{ + struct map **mapp; + + if (maps__maps_by_name(maps) == NULL && + map__groups__sort_by_name_from_rbtree(maps)) + return NULL; + + mapp = bsearch(name, maps__maps_by_name(maps), maps__nr_maps(maps), + sizeof(*mapp), map__strcmp_name); + if (mapp) + return *mapp; + return NULL; +} + +struct map *maps__find_by_name(struct maps *maps, const char *name) +{ + struct map_rb_node *rb_node; + struct map *map; + + down_read(maps__lock(maps)); + + + if (RC_CHK_ACCESS(maps)->last_search_by_name) { + const struct dso *dso = map__dso(RC_CHK_ACCESS(maps)->last_search_by_name); + + if (strcmp(dso->short_name, name) == 0) { + map = RC_CHK_ACCESS(maps)->last_search_by_name; + goto out_unlock; + } + } + /* + * If we have maps->maps_by_name, then the name isn't in the rbtree, + * as maps->maps_by_name mirrors the rbtree when lookups by name are + * made. + */ + map = __maps__find_by_name(maps, name); + if (map || maps__maps_by_name(maps) != NULL) + goto out_unlock; + + /* Fallback to traversing the rbtree... */ + maps__for_each_entry(maps, rb_node) { + struct dso *dso; + + map = rb_node->map; + dso = map__dso(map); + if (strcmp(dso->short_name, name) == 0) { + RC_CHK_ACCESS(maps)->last_search_by_name = map; + goto out_unlock; + } + } + map = NULL; + +out_unlock: + up_read(maps__lock(maps)); + return map; +} + +void maps__fixup_end(struct maps *maps) +{ + struct map_rb_node *prev = NULL, *curr; + + down_write(maps__lock(maps)); + + maps__for_each_entry(maps, curr) { + if (prev != NULL && !map__end(prev->map)) + map__set_end(prev->map, map__start(curr->map)); + + prev = curr; + } + + /* + * We still haven't the actual symbols, so guess the + * last map final address. + */ + if (curr && !map__end(curr->map)) + map__set_end(curr->map, ~0ULL); + + up_write(maps__lock(maps)); +} + +/* + * Merges map into maps by splitting the new map within the existing map + * regions. + */ +int maps__merge_in(struct maps *kmaps, struct map *new_map) +{ + struct map_rb_node *rb_node; + LIST_HEAD(merged); + int err = 0; + + maps__for_each_entry(kmaps, rb_node) { + struct map *old_map = rb_node->map; + + /* no overload with this one */ + if (map__end(new_map) < map__start(old_map) || + map__start(new_map) >= map__end(old_map)) + continue; + + if (map__start(new_map) < map__start(old_map)) { + /* + * |new...... + * |old.... + */ + if (map__end(new_map) < map__end(old_map)) { + /* + * |new......| -> |new..| + * |old....| -> |old....| + */ + map__set_end(new_map, map__start(old_map)); + } else { + /* + * |new.............| -> |new..| |new..| + * |old....| -> |old....| + */ + struct map_list_node *m = map_list_node__new(); + + if (!m) { + err = -ENOMEM; + goto out; + } + + m->map = map__clone(new_map); + if (!m->map) { + free(m); + err = -ENOMEM; + goto out; + } + + map__set_end(m->map, map__start(old_map)); + list_add_tail(&m->node, &merged); + map__add_pgoff(new_map, map__end(old_map) - map__start(new_map)); + map__set_start(new_map, map__end(old_map)); + } + } else { + /* + * |new...... + * |old.... + */ + if (map__end(new_map) < map__end(old_map)) { + /* + * |new..| -> x + * |old.........| -> |old.........| + */ + map__put(new_map); + new_map = NULL; + break; + } else { + /* + * |new......| -> |new...| + * |old....| -> |old....| + */ + map__add_pgoff(new_map, map__end(old_map) - map__start(new_map)); + map__set_start(new_map, map__end(old_map)); + } + } + } + +out: + while (!list_empty(&merged)) { + struct map_list_node *old_node; + + old_node = list_entry(merged.next, struct map_list_node, node); + list_del_init(&old_node->node); + if (!err) + err = maps__insert(kmaps, old_node->map); + map__put(old_node->map); + free(old_node); + } + + if (new_map) { + if (!err) + err = maps__insert(kmaps, new_map); + map__put(new_map); + } + return err; +} diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h index 83144e0645ed4..a689149be8c43 100644 --- a/tools/perf/util/maps.h +++ b/tools/perf/util/maps.h @@ -21,6 +21,16 @@ struct map_rb_node { struct map *map; }; +struct map_list_node { + struct list_head node; + struct map *map; +}; + +static inline struct map_list_node *map_list_node__new(void) +{ + return malloc(sizeof(struct map_list_node)); +} + struct map_rb_node *maps__first(struct maps *maps); struct map_rb_node *map_rb_node__next(struct map_rb_node *node); struct map_rb_node *maps__find_node(struct maps *maps, struct map *map); @@ -133,4 +143,6 @@ int maps__merge_in(struct maps *kmaps, struct map *new_map); void __maps__sort_by_name(struct maps *maps); +void maps__fixup_end(struct maps *maps); + #endif // __PERF_MAPS_H diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 314c0263bf3c0..1cc42b8d8afb7 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -48,11 +48,6 @@ static bool symbol__is_idle(const char *name); int vmlinux_path__nr_entries; char **vmlinux_path; -struct map_list_node { - struct list_head node; - struct map *map; -}; - struct symbol_conf symbol_conf = { .nanosecs = false, .use_modules = true, @@ -90,11 +85,6 @@ static enum dso_binary_type binary_type_symtab[] = { #define DSO_BINARY_TYPE__SYMTAB_CNT ARRAY_SIZE(binary_type_symtab) -static struct map_list_node *map_list_node__new(void) -{ - return malloc(sizeof(struct map_list_node)); -} - static bool symbol_type__filter(char symbol_type) { symbol_type = toupper(symbol_type); @@ -270,29 +260,6 @@ void symbols__fixup_end(struct rb_root_cached *symbols, bool is_kallsyms) curr->end = roundup(curr->start, 4096) + 4096; } -void maps__fixup_end(struct maps *maps) -{ - struct map_rb_node *prev = NULL, *curr; - - down_write(maps__lock(maps)); - - maps__for_each_entry(maps, curr) { - if (prev != NULL && !map__end(prev->map)) - map__set_end(prev->map, map__start(curr->map)); - - prev = curr; - } - - /* - * We still haven't the actual symbols, so guess the - * last map final address. - */ - if (curr && !map__end(curr->map)) - map__set_end(curr->map, ~0ULL); - - up_write(maps__lock(maps)); -} - struct symbol *symbol__new(u64 start, u64 len, u8 binding, u8 type, const char *name) { size_t namelen = strlen(name) + 1; @@ -1270,103 +1237,6 @@ static int kcore_mapfn(u64 start, u64 len, u64 pgoff, void *data) return 0; } -/* - * Merges map into maps by splitting the new map within the existing map - * regions. - */ -int maps__merge_in(struct maps *kmaps, struct map *new_map) -{ - struct map_rb_node *rb_node; - LIST_HEAD(merged); - int err = 0; - - maps__for_each_entry(kmaps, rb_node) { - struct map *old_map = rb_node->map; - - /* no overload with this one */ - if (map__end(new_map) < map__start(old_map) || - map__start(new_map) >= map__end(old_map)) - continue; - - if (map__start(new_map) < map__start(old_map)) { - /* - * |new...... - * |old.... - */ - if (map__end(new_map) < map__end(old_map)) { - /* - * |new......| -> |new..| - * |old....| -> |old....| - */ - map__set_end(new_map, map__start(old_map)); - } else { - /* - * |new.............| -> |new..| |new..| - * |old....| -> |old....| - */ - struct map_list_node *m = map_list_node__new(); - - if (!m) { - err = -ENOMEM; - goto out; - } - - m->map = map__clone(new_map); - if (!m->map) { - free(m); - err = -ENOMEM; - goto out; - } - - map__set_end(m->map, map__start(old_map)); - list_add_tail(&m->node, &merged); - map__add_pgoff(new_map, map__end(old_map) - map__start(new_map)); - map__set_start(new_map, map__end(old_map)); - } - } else { - /* - * |new...... - * |old.... - */ - if (map__end(new_map) < map__end(old_map)) { - /* - * |new..| -> x - * |old.........| -> |old.........| - */ - map__put(new_map); - new_map = NULL; - break; - } else { - /* - * |new......| -> |new...| - * |old....| -> |old....| - */ - map__add_pgoff(new_map, map__end(old_map) - map__start(new_map)); - map__set_start(new_map, map__end(old_map)); - } - } - } - -out: - while (!list_empty(&merged)) { - struct map_list_node *old_node; - - old_node = list_entry(merged.next, struct map_list_node, node); - list_del_init(&old_node->node); - if (!err) - err = maps__insert(kmaps, old_node->map); - map__put(old_node->map); - free(old_node); - } - - if (new_map) { - if (!err) - err = maps__insert(kmaps, new_map); - map__put(new_map); - } - return err; -} - static int dso__load_kcore(struct dso *dso, struct map *map, const char *kallsyms_filename) { @@ -2065,124 +1935,6 @@ out: return ret; } -static int map__strcmp(const void *a, const void *b) -{ - const struct map *map_a = *(const struct map **)a; - const struct map *map_b = *(const struct map **)b; - const struct dso *dso_a = map__dso(map_a); - const struct dso *dso_b = map__dso(map_b); - int ret = strcmp(dso_a->short_name, dso_b->short_name); - - if (ret == 0 && map_a != map_b) { - /* - * Ensure distinct but name equal maps have an order in part to - * aid reference counting. - */ - ret = (int)map__start(map_a) - (int)map__start(map_b); - if (ret == 0) - ret = (int)((intptr_t)map_a - (intptr_t)map_b); - } - - return ret; -} - -static int map__strcmp_name(const void *name, const void *b) -{ - const struct dso *dso = map__dso(*(const struct map **)b); - - return strcmp(name, dso->short_name); -} - -void __maps__sort_by_name(struct maps *maps) -{ - qsort(maps__maps_by_name(maps), maps__nr_maps(maps), sizeof(struct map *), map__strcmp); -} - -static int map__groups__sort_by_name_from_rbtree(struct maps *maps) -{ - struct map_rb_node *rb_node; - struct map **maps_by_name = realloc(maps__maps_by_name(maps), - maps__nr_maps(maps) * sizeof(struct map *)); - int i = 0; - - if (maps_by_name == NULL) - return -1; - - up_read(maps__lock(maps)); - down_write(maps__lock(maps)); - - RC_CHK_ACCESS(maps)->maps_by_name = maps_by_name; - RC_CHK_ACCESS(maps)->nr_maps_allocated = maps__nr_maps(maps); - - maps__for_each_entry(maps, rb_node) - maps_by_name[i++] = map__get(rb_node->map); - - __maps__sort_by_name(maps); - - up_write(maps__lock(maps)); - down_read(maps__lock(maps)); - - return 0; -} - -static struct map *__maps__find_by_name(struct maps *maps, const char *name) -{ - struct map **mapp; - - if (maps__maps_by_name(maps) == NULL && - map__groups__sort_by_name_from_rbtree(maps)) - return NULL; - - mapp = bsearch(name, maps__maps_by_name(maps), maps__nr_maps(maps), - sizeof(*mapp), map__strcmp_name); - if (mapp) - return *mapp; - return NULL; -} - -struct map *maps__find_by_name(struct maps *maps, const char *name) -{ - struct map_rb_node *rb_node; - struct map *map; - - down_read(maps__lock(maps)); - - - if (RC_CHK_ACCESS(maps)->last_search_by_name) { - const struct dso *dso = map__dso(RC_CHK_ACCESS(maps)->last_search_by_name); - - if (strcmp(dso->short_name, name) == 0) { - map = RC_CHK_ACCESS(maps)->last_search_by_name; - goto out_unlock; - } - } - /* - * If we have maps->maps_by_name, then the name isn't in the rbtree, - * as maps->maps_by_name mirrors the rbtree when lookups by name are - * made. - */ - map = __maps__find_by_name(maps, name); - if (map || maps__maps_by_name(maps) != NULL) - goto out_unlock; - - /* Fallback to traversing the rbtree... */ - maps__for_each_entry(maps, rb_node) { - struct dso *dso; - - map = rb_node->map; - dso = map__dso(map); - if (strcmp(dso->short_name, name) == 0) { - RC_CHK_ACCESS(maps)->last_search_by_name = map; - goto out_unlock; - } - } - map = NULL; - -out_unlock: - up_read(maps__lock(maps)); - return map; -} - int dso__load_vmlinux(struct dso *dso, struct map *map, const char *vmlinux, bool vmlinux_allocated) { diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index af87c46b3f89e..071837ddce2ac 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -189,7 +189,6 @@ void __symbols__insert(struct rb_root_cached *symbols, struct symbol *sym, void symbols__insert(struct rb_root_cached *symbols, struct symbol *sym); void symbols__fixup_duplicate(struct rb_root_cached *symbols); void symbols__fixup_end(struct rb_root_cached *symbols, bool is_kallsyms); -void maps__fixup_end(struct maps *maps); typedef int (*mapfn_t)(u64 start, u64 len, u64 pgoff, void *data); int file__read_maps(int fd, bool exe, mapfn_t mapfn, void *data, -- GitLab From 01261d8a0f082b1a926d14ecb3ae05e52c477c74 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 27 Nov 2023 14:08:26 -0800 Subject: [PATCH 0207/1290] perf thread: Add missing RC_CHK_EQUAL Comparing pointers without RC_CHK_ACCESS means the indirect object will be compared rather than the underlying maps when REFCNT_CHECKING is enabled. Fix by adding missing RC_CHK_EQUAL. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Jajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Miguel Ojeda Cc: Ming Wang Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231127220902.1315692-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index fe5e6991ae4b4..b9c2039c4230c 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -385,7 +385,7 @@ static int thread__clone_maps(struct thread *thread, struct thread *parent, bool if (thread__pid(thread) == thread__pid(parent)) return thread__prepare_access(thread); - if (thread__maps(thread) == thread__maps(parent)) { + if (RC_CHK_EQUAL(thread__maps(thread), thread__maps(parent))) { pr_debug("broken map groups on thread %d/%d parent %d/%d\n", thread__pid(thread), thread__tid(thread), thread__pid(parent), thread__tid(parent)); -- GitLab From 0713ab3bd169da82c35eefd012b07b715e4ebcf7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 10:35:33 -0800 Subject: [PATCH 0208/1290] perf stat: Exit perf stat if parse groups fails Metrics were added by a callback but commit a4b8cfcabb1d90ec ("perf stat: Delay metric parsing") postponed this to allow optimizations based on the CPU configuration. In doing so it stopped errors in metric parsing from causing 'perf stat' termination. This change adds the termination for bad metric names back in. Fixes: a4b8cfcabb1d90ec ("perf stat: Delay metric parsing") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Closes: https://lore.kernel.org/lkml/ZXByT1K6enTh2EHT@kernel.org/ Link: https://lore.kernel.org/r/20231206183533.972028-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d8e5d6f7a87a6..d22228eddccbf 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2695,15 +2695,19 @@ int cmd_stat(int argc, const char **argv) */ if (metrics) { const char *pmu = parse_events_option_args.pmu_filter ?: "all"; + int ret = metricgroup__parse_groups(evsel_list, pmu, metrics, + stat_config.metric_no_group, + stat_config.metric_no_merge, + stat_config.metric_no_threshold, + stat_config.user_requested_cpu_list, + stat_config.system_wide, + &stat_config.metric_events); - metricgroup__parse_groups(evsel_list, pmu, metrics, - stat_config.metric_no_group, - stat_config.metric_no_merge, - stat_config.metric_no_threshold, - stat_config.user_requested_cpu_list, - stat_config.system_wide, - &stat_config.metric_events); zfree(&metrics); + if (ret) { + status = ret; + goto out; + } } if (add_default_attributes()) -- GitLab From 64d9799d6dd04601227f602ae961e3f3d2f1f02b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 6 Dec 2023 09:45:25 -0800 Subject: [PATCH 0209/1290] backlight: ili922x: Drop kernel-doc for local macros Don't use kernel-doc notation for the local macros START_BYTE() and CHECK_FREQ_REG(). This prevents these kernel-doc warnings: ili922x.c:85: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * START_BYTE(id, rs, rw) ili922x.c:85: warning: missing initial short description on line: * START_BYTE(id, rs, rw) ili922x.c:118: warning: expecting prototype for CHECK_FREQ_REG(spi_device s, spi_transfer x)(). Prototype was for CHECK_FREQ_REG() instead Signed-off-by: Randy Dunlap Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20231206174525.14960-1-rdunlap@infradead.org Signed-off-by: Lee Jones --- drivers/video/backlight/ili922x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/video/backlight/ili922x.c b/drivers/video/backlight/ili922x.c index e7b6bd827986f..206e77afc56f2 100644 --- a/drivers/video/backlight/ili922x.c +++ b/drivers/video/backlight/ili922x.c @@ -81,7 +81,7 @@ #define START_RW_WRITE 0 #define START_RW_READ 1 -/** +/* * START_BYTE(id, rs, rw) * * Set the start byte according to the required operation. @@ -100,7 +100,7 @@ #define START_BYTE(id, rs, rw) \ (0x70 | (((id) & 0x01) << 2) | (((rs) & 0x01) << 1) | ((rw) & 0x01)) -/** +/* * CHECK_FREQ_REG(spi_device s, spi_transfer x) - Check the frequency * for the SPI transfer. According to the datasheet, the controller * accept higher frequency for the GRAM transfer, but it requires -- GitLab From 769ff5283f0d7edc819743f183d51af077411107 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Thu, 30 Nov 2023 13:11:56 +0800 Subject: [PATCH 0210/1290] backlight: ili922x: Add an error code check in ili922x_write() Clang static analyzer complains that value stored to 'ret' is never read. Return the error code when spi_sync() failed. Signed-off-by: Su Hui Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20231130051155.1235972-1-suhui@nfschina.com Signed-off-by: Lee Jones --- drivers/video/backlight/ili922x.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/video/backlight/ili922x.c b/drivers/video/backlight/ili922x.c index 206e77afc56f2..c8e0e655dc867 100644 --- a/drivers/video/backlight/ili922x.c +++ b/drivers/video/backlight/ili922x.c @@ -269,6 +269,10 @@ static int ili922x_write(struct spi_device *spi, u8 reg, u16 value) spi_message_add_tail(&xfer_regindex, &msg); ret = spi_sync(spi, &msg); + if (ret < 0) { + dev_err(&spi->dev, "Error sending SPI message 0x%x", ret); + return ret; + } spi_message_init(&msg); tbuf[0] = set_tx_byte(START_BYTE(ili922x_id, START_RS_REG, -- GitLab From 9d03194a36345796d4f0f8d6b72eb770a45d614e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 28 Nov 2023 09:54:34 -0800 Subject: [PATCH 0211/1290] perf annotate: Introduce global annotation_options The annotation options are to control the behavior of objdump and the output. It's basically used by 'perf annotate' but 'perf report' and 'perf top' can call it on TUI dynamically. But it doesn't need to have a copy of annotation options in many places. As most of the work is done in the util/annotate.c file, add a global variable and set/use it instead of having their own copies. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231128175441.721579-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 43 +++++++++++++++++------------------ tools/perf/util/annotate.c | 3 +++ tools/perf/util/annotate.h | 2 ++ 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index a9129b51d5118..67b36a7a12e3e 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -45,7 +45,6 @@ struct perf_annotate { struct perf_tool tool; struct perf_session *session; - struct annotation_options opts; #ifdef HAVE_SLANG_SUPPORT bool use_tui; #endif @@ -318,9 +317,9 @@ static int hist_entry__tty_annotate(struct hist_entry *he, struct perf_annotate *ann) { if (!ann->use_stdio2) - return symbol__tty_annotate(&he->ms, evsel, &ann->opts); + return symbol__tty_annotate(&he->ms, evsel, &annotate_opts); - return symbol__tty_annotate2(&he->ms, evsel, &ann->opts); + return symbol__tty_annotate2(&he->ms, evsel, &annotate_opts); } static void hists__find_annotations(struct hists *hists, @@ -376,14 +375,14 @@ find_next: return; } - ret = annotate(he, evsel, &ann->opts, NULL); + ret = annotate(he, evsel, &annotate_opts, NULL); if (!ret || !ann->skip_missing) return; /* skip missing symbols */ nd = rb_next(nd); } else if (use_browser == 1) { - key = hist_entry__tui_annotate(he, evsel, NULL, &ann->opts); + key = hist_entry__tui_annotate(he, evsel, NULL, &annotate_opts); switch (key) { case -1: @@ -425,9 +424,9 @@ static int __cmd_annotate(struct perf_annotate *ann) goto out; } - if (!ann->opts.objdump_path) { + if (!annotate_opts.objdump_path) { ret = perf_env__lookup_objdump(&session->header.env, - &ann->opts.objdump_path); + &annotate_opts.objdump_path); if (ret) goto out; } @@ -561,9 +560,9 @@ int cmd_annotate(int argc, const char **argv) "file", "vmlinux pathname"), OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules, "load module symbols - WARNING: use only with -k and LIVE kernel"), - OPT_BOOLEAN('l', "print-line", &annotate.opts.print_lines, + OPT_BOOLEAN('l', "print-line", &annotate_opts.print_lines, "print matching source lines (may be slow)"), - OPT_BOOLEAN('P', "full-paths", &annotate.opts.full_path, + OPT_BOOLEAN('P', "full-paths", &annotate_opts.full_path, "Don't shorten the displayed pathnames"), OPT_BOOLEAN(0, "skip-missing", &annotate.skip_missing, "Skip symbols that cannot be annotated"), @@ -574,15 +573,15 @@ int cmd_annotate(int argc, const char **argv) OPT_CALLBACK(0, "symfs", NULL, "directory", "Look for files with symbols relative to this directory", symbol__config_symfs), - OPT_BOOLEAN(0, "source", &annotate.opts.annotate_src, + OPT_BOOLEAN(0, "source", &annotate_opts.annotate_src, "Interleave source code with assembly code (default)"), - OPT_BOOLEAN(0, "asm-raw", &annotate.opts.show_asm_raw, + OPT_BOOLEAN(0, "asm-raw", &annotate_opts.show_asm_raw, "Display raw encoding of assembly instructions (default)"), OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style", "Specify disassembler style (e.g. -M intel for intel syntax)"), - OPT_STRING(0, "prefix", &annotate.opts.prefix, "prefix", + OPT_STRING(0, "prefix", &annotate_opts.prefix, "prefix", "Add prefix to source file path names in programs (with --prefix-strip)"), - OPT_STRING(0, "prefix-strip", &annotate.opts.prefix_strip, "N", + OPT_STRING(0, "prefix-strip", &annotate_opts.prefix_strip, "N", "Strip first N entries of source file path name in programs (with --prefix)"), OPT_STRING(0, "objdump", &objdump_path, "path", "objdump binary to use for disassembly and annotations"), @@ -601,7 +600,7 @@ int cmd_annotate(int argc, const char **argv) OPT_CALLBACK_DEFAULT(0, "stdio-color", NULL, "mode", "'always' (default), 'never' or 'auto' only applicable to --stdio mode", stdio__config_color, "always"), - OPT_CALLBACK(0, "percent-type", &annotate.opts, "local-period", + OPT_CALLBACK(0, "percent-type", &annotate_opts, "local-period", "Set percent type local/global-period/hits", annotate_parse_percent_type), OPT_CALLBACK(0, "percent-limit", &annotate, "percent", @@ -617,13 +616,13 @@ int cmd_annotate(int argc, const char **argv) set_option_flag(options, 0, "show-total-period", PARSE_OPT_EXCLUSIVE); set_option_flag(options, 0, "show-nr-samples", PARSE_OPT_EXCLUSIVE); - annotation_options__init(&annotate.opts); + annotation_options__init(&annotate_opts); ret = hists__init(); if (ret < 0) return ret; - annotation_config__init(&annotate.opts); + annotation_config__init(&annotate_opts); argc = parse_options(argc, argv, options, annotate_usage, 0); if (argc) { @@ -638,13 +637,13 @@ int cmd_annotate(int argc, const char **argv) } if (disassembler_style) { - annotate.opts.disassembler_style = strdup(disassembler_style); - if (!annotate.opts.disassembler_style) + annotate_opts.disassembler_style = strdup(disassembler_style); + if (!annotate_opts.disassembler_style) return -ENOMEM; } if (objdump_path) { - annotate.opts.objdump_path = strdup(objdump_path); - if (!annotate.opts.objdump_path) + annotate_opts.objdump_path = strdup(objdump_path); + if (!annotate_opts.objdump_path) return -ENOMEM; } if (addr2line_path) { @@ -653,7 +652,7 @@ int cmd_annotate(int argc, const char **argv) return -ENOMEM; } - if (annotate_check_args(&annotate.opts) < 0) + if (annotate_check_args(&annotate_opts) < 0) return -EINVAL; #ifdef HAVE_GTK2_SUPPORT @@ -734,7 +733,7 @@ out_delete: #ifndef NDEBUG perf_session__delete(annotate.session); #endif - annotation_options__exit(&annotate.opts); + annotation_options__exit(&annotate_opts); return ret; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 9a828dc601c75..77b78001b94d0 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -57,6 +57,9 @@ #include +/* global annotation options */ +struct annotation_options annotate_opts; + static regex_t file_lineno; static struct ins_ops *ins__find(struct arch *arch, const char *name); diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index b64a2be287b30..8c1a070725faa 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -105,6 +105,8 @@ struct annotation_options { unsigned int percent_type; }; +extern struct annotation_options annotate_opts; + enum { ANNOTATION__OFFSET_JUMP_TARGETS = 1, ANNOTATION__OFFSET_CALL, -- GitLab From 14953f038d6b30e3dc9d1aa4d4584ac505e5a8ec Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 28 Nov 2023 09:54:35 -0800 Subject: [PATCH 0212/1290] perf report: Convert to the global annotation_options Use the global option and drop the local copy. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231128175441.721579-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 121a2781323c8..90f98953587c0 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -98,7 +98,6 @@ struct report { bool skip_empty; int max_stack; struct perf_read_values show_threads_values; - struct annotation_options annotation_opts; const char *pretty_printing_style; const char *cpu_list; const char *symbol_filter_str; @@ -542,7 +541,7 @@ static int evlist__tui_block_hists_browse(struct evlist *evlist, struct report * ret = report__browse_block_hists(&rep->block_reports[i++].hist, rep->min_percent, pos, &rep->session->header.env, - &rep->annotation_opts); + &annotate_opts); if (ret != 0) return ret; } @@ -670,7 +669,7 @@ static int report__browse_hists(struct report *rep) } ret = evlist__tui_browse_hists(evlist, help, NULL, rep->min_percent, - &session->header.env, true, &rep->annotation_opts); + &session->header.env, true, &annotate_opts); /* * Usually "ret" is the last pressed key, and we only * care if the key notifies us to switch data file. @@ -745,7 +744,7 @@ static int hists__resort_cb(struct hist_entry *he, void *arg) if (rep->symbol_ipc && sym && !sym->annotate2) { struct evsel *evsel = hists_to_evsel(he->hists); - symbol__annotate2(&he->ms, evsel, &rep->annotation_opts, NULL); + symbol__annotate2(&he->ms, evsel, &annotate_opts, NULL); } return 0; @@ -1341,15 +1340,15 @@ int cmd_report(int argc, const char **argv) "list of cpus to profile"), OPT_BOOLEAN('I', "show-info", &report.show_full_info, "Display extended information about perf.data file"), - OPT_BOOLEAN(0, "source", &report.annotation_opts.annotate_src, + OPT_BOOLEAN(0, "source", &annotate_opts.annotate_src, "Interleave source code with assembly code (default)"), - OPT_BOOLEAN(0, "asm-raw", &report.annotation_opts.show_asm_raw, + OPT_BOOLEAN(0, "asm-raw", &annotate_opts.show_asm_raw, "Display raw encoding of assembly instructions (default)"), OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style", "Specify disassembler style (e.g. -M intel for intel syntax)"), - OPT_STRING(0, "prefix", &report.annotation_opts.prefix, "prefix", + OPT_STRING(0, "prefix", &annotate_opts.prefix, "prefix", "Add prefix to source file path names in programs (with --prefix-strip)"), - OPT_STRING(0, "prefix-strip", &report.annotation_opts.prefix_strip, "N", + OPT_STRING(0, "prefix-strip", &annotate_opts.prefix_strip, "N", "Strip first N entries of source file path name in programs (with --prefix)"), OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period, "Show a column with the sum of periods"), @@ -1401,7 +1400,7 @@ int cmd_report(int argc, const char **argv) "Time span of interest (start,stop)"), OPT_BOOLEAN(0, "inline", &symbol_conf.inline_name, "Show inline function"), - OPT_CALLBACK(0, "percent-type", &report.annotation_opts, "local-period", + OPT_CALLBACK(0, "percent-type", &annotate_opts, "local-period", "Set percent type local/global-period/hits", annotate_parse_percent_type), OPT_BOOLEAN(0, "ns", &symbol_conf.nanosecs, "Show times in nanosecs"), @@ -1433,7 +1432,7 @@ int cmd_report(int argc, const char **argv) */ symbol_conf.keep_exited_threads = true; - annotation_options__init(&report.annotation_opts); + annotation_options__init(&annotate_opts); ret = perf_config(report__config, &report); if (ret) @@ -1452,13 +1451,13 @@ int cmd_report(int argc, const char **argv) } if (disassembler_style) { - report.annotation_opts.disassembler_style = strdup(disassembler_style); - if (!report.annotation_opts.disassembler_style) + annotate_opts.disassembler_style = strdup(disassembler_style); + if (!annotate_opts.disassembler_style) return -ENOMEM; } if (objdump_path) { - report.annotation_opts.objdump_path = strdup(objdump_path); - if (!report.annotation_opts.objdump_path) + annotate_opts.objdump_path = strdup(objdump_path); + if (!annotate_opts.objdump_path) return -ENOMEM; } if (addr2line_path) { @@ -1467,7 +1466,7 @@ int cmd_report(int argc, const char **argv) return -ENOMEM; } - if (annotate_check_args(&report.annotation_opts) < 0) { + if (annotate_check_args(&annotate_opts) < 0) { ret = -EINVAL; goto exit; } @@ -1699,7 +1698,7 @@ repeat: */ symbol_conf.priv_size += sizeof(u32); } - annotation_config__init(&report.annotation_opts); + annotation_config__init(&annotate_opts); } if (symbol__init(&session->header.env) < 0) @@ -1753,7 +1752,7 @@ error: zstd_fini(&(session->zstd_data)); perf_session__delete(session); exit: - annotation_options__exit(&report.annotation_opts); + annotation_options__exit(&annotate_opts); free(sort_order_help); free(field_order_help); return ret; -- GitLab From c9a21a872c69032cb9a94ebc171649c0c28141d7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 28 Nov 2023 09:54:36 -0800 Subject: [PATCH 0213/1290] perf top: Convert to the global annotation_options Use the global option and drop the local copy. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231128175441.721579-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 44 ++++++++++++++++++++-------------------- tools/perf/util/top.h | 1 - 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 1e42bd1c7d5a5..a6495aa5898ed 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -147,7 +147,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) return err; } - err = symbol__annotate(&he->ms, evsel, &top->annotation_opts, NULL); + err = symbol__annotate(&he->ms, evsel, &annotate_opts, NULL); if (err == 0) { top->sym_filter_entry = he; } else { @@ -261,9 +261,9 @@ static void perf_top__show_details(struct perf_top *top) goto out_unlock; printf("Showing %s for %s\n", evsel__name(top->sym_evsel), symbol->name); - printf(" Events Pcnt (>=%d%%)\n", top->annotation_opts.min_pcnt); + printf(" Events Pcnt (>=%d%%)\n", annotate_opts.min_pcnt); - more = symbol__annotate_printf(&he->ms, top->sym_evsel, &top->annotation_opts); + more = symbol__annotate_printf(&he->ms, top->sym_evsel, &annotate_opts); if (top->evlist->enabled) { if (top->zero) @@ -450,7 +450,7 @@ static void perf_top__print_mapped_keys(struct perf_top *top) fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", top->count_filter); - fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", top->annotation_opts.min_pcnt); + fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", annotate_opts.min_pcnt); fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL"); fprintf(stdout, "\t[S] stop annotation.\n"); @@ -553,7 +553,7 @@ static bool perf_top__handle_keypress(struct perf_top *top, int c) prompt_integer(&top->count_filter, "Enter display event count filter"); break; case 'F': - prompt_percent(&top->annotation_opts.min_pcnt, + prompt_percent(&annotate_opts.min_pcnt, "Enter details display event filter (percent)"); break; case 'K': @@ -647,7 +647,7 @@ repeat: ret = evlist__tui_browse_hists(top->evlist, help, &hbt, top->min_percent, &top->session->header.env, !top->record_opts.overwrite, - &top->annotation_opts); + &annotate_opts); if (ret == K_RELOAD) { top->zero = true; goto repeat; @@ -1241,9 +1241,9 @@ static int __cmd_top(struct perf_top *top) pthread_t thread, thread_process; int ret; - if (!top->annotation_opts.objdump_path) { + if (!annotate_opts.objdump_path) { ret = perf_env__lookup_objdump(&top->session->header.env, - &top->annotation_opts.objdump_path); + &annotate_opts.objdump_path); if (ret) return ret; } @@ -1536,9 +1536,9 @@ int cmd_top(int argc, const char **argv) "only consider symbols in these comms"), OPT_STRING(0, "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]", "only consider these symbols"), - OPT_BOOLEAN(0, "source", &top.annotation_opts.annotate_src, + OPT_BOOLEAN(0, "source", &annotate_opts.annotate_src, "Interleave source code with assembly code (default)"), - OPT_BOOLEAN(0, "asm-raw", &top.annotation_opts.show_asm_raw, + OPT_BOOLEAN(0, "asm-raw", &annotate_opts.show_asm_raw, "Display raw encoding of assembly instructions (default)"), OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel, "Enable kernel symbol demangling"), @@ -1549,9 +1549,9 @@ int cmd_top(int argc, const char **argv) "addr2line binary to use for line numbers"), OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style", "Specify disassembler style (e.g. -M intel for intel syntax)"), - OPT_STRING(0, "prefix", &top.annotation_opts.prefix, "prefix", + OPT_STRING(0, "prefix", &annotate_opts.prefix, "prefix", "Add prefix to source file path names in programs (with --prefix-strip)"), - OPT_STRING(0, "prefix-strip", &top.annotation_opts.prefix_strip, "N", + OPT_STRING(0, "prefix-strip", &annotate_opts.prefix_strip, "N", "Strip first N entries of source file path name in programs (with --prefix)"), OPT_STRING('u', "uid", &target->uid_str, "user", "user to profile"), OPT_CALLBACK(0, "percent-limit", &top, "percent", @@ -1609,10 +1609,10 @@ int cmd_top(int argc, const char **argv) if (status < 0) return status; - annotation_options__init(&top.annotation_opts); + annotation_options__init(&annotate_opts); - top.annotation_opts.min_pcnt = 5; - top.annotation_opts.context = 4; + annotate_opts.min_pcnt = 5; + annotate_opts.context = 4; top.evlist = evlist__new(); if (top.evlist == NULL) @@ -1642,13 +1642,13 @@ int cmd_top(int argc, const char **argv) usage_with_options(top_usage, options); if (disassembler_style) { - top.annotation_opts.disassembler_style = strdup(disassembler_style); - if (!top.annotation_opts.disassembler_style) + annotate_opts.disassembler_style = strdup(disassembler_style); + if (!annotate_opts.disassembler_style) return -ENOMEM; } if (objdump_path) { - top.annotation_opts.objdump_path = strdup(objdump_path); - if (!top.annotation_opts.objdump_path) + annotate_opts.objdump_path = strdup(objdump_path); + if (!annotate_opts.objdump_path) return -ENOMEM; } if (addr2line_path) { @@ -1661,7 +1661,7 @@ int cmd_top(int argc, const char **argv) if (status) goto out_delete_evlist; - if (annotate_check_args(&top.annotation_opts) < 0) + if (annotate_check_args(&annotate_opts) < 0) goto out_delete_evlist; if (!top.evlist->core.nr_entries) { @@ -1787,7 +1787,7 @@ int cmd_top(int argc, const char **argv) if (status < 0) goto out_delete_evlist; - annotation_config__init(&top.annotation_opts); + annotation_config__init(&annotate_opts); symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL); status = symbol__init(NULL); @@ -1840,7 +1840,7 @@ int cmd_top(int argc, const char **argv) out_delete_evlist: evlist__delete(top.evlist); perf_session__delete(top.session); - annotation_options__exit(&top.annotation_opts); + annotation_options__exit(&annotate_opts); return status; } diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index a8b0d79bd96cf..4c5588dbb1317 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -21,7 +21,6 @@ struct perf_top { struct perf_tool tool; struct evlist *evlist, *sb_evlist; struct record_opts record_opts; - struct annotation_options annotation_opts; struct evswitch evswitch; /* * Symbols will be added here in perf_event__process_sample and will -- GitLab From 41fd3cacd29f47f6b9c6474b27c5b0513786c4e9 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 28 Nov 2023 09:54:37 -0800 Subject: [PATCH 0214/1290] perf annotate: Use global annotation_options Now it can directly use the global options and no need to pass it as an argument. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231128175441.721579-5-namhyung@kernel.org [ Fixup build with GTK2=1 ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 7 +- tools/perf/builtin-report.c | 2 +- tools/perf/builtin-top.c | 4 +- tools/perf/ui/browsers/annotate.c | 6 +- tools/perf/ui/gtk/annotate.c | 6 +- tools/perf/ui/gtk/gtk.h | 2 - tools/perf/util/annotate.c | 118 ++++++++++++++---------------- tools/perf/util/annotate.h | 15 ++-- 8 files changed, 71 insertions(+), 89 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 67b36a7a12e3e..9c1e2b2b5bc0b 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -317,9 +317,9 @@ static int hist_entry__tty_annotate(struct hist_entry *he, struct perf_annotate *ann) { if (!ann->use_stdio2) - return symbol__tty_annotate(&he->ms, evsel, &annotate_opts); + return symbol__tty_annotate(&he->ms, evsel); - return symbol__tty_annotate2(&he->ms, evsel, &annotate_opts); + return symbol__tty_annotate2(&he->ms, evsel); } static void hists__find_annotations(struct hists *hists, @@ -365,7 +365,6 @@ find_next: int ret; int (*annotate)(struct hist_entry *he, struct evsel *evsel, - struct annotation_options *options, struct hist_browser_timer *hbt); annotate = dlsym(perf_gtk_handle, @@ -375,7 +374,7 @@ find_next: return; } - ret = annotate(he, evsel, &annotate_opts, NULL); + ret = annotate(he, evsel, NULL); if (!ret || !ann->skip_missing) return; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 90f98953587c0..2b86651615cd7 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -744,7 +744,7 @@ static int hists__resort_cb(struct hist_entry *he, void *arg) if (rep->symbol_ipc && sym && !sym->annotate2) { struct evsel *evsel = hists_to_evsel(he->hists); - symbol__annotate2(&he->ms, evsel, &annotate_opts, NULL); + symbol__annotate2(&he->ms, evsel, NULL); } return 0; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index a6495aa5898ed..46a61634701b7 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -147,7 +147,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) return err; } - err = symbol__annotate(&he->ms, evsel, &annotate_opts, NULL); + err = symbol__annotate(&he->ms, evsel, NULL); if (err == 0) { top->sym_filter_entry = he; } else { @@ -263,7 +263,7 @@ static void perf_top__show_details(struct perf_top *top) printf("Showing %s for %s\n", evsel__name(top->sym_evsel), symbol->name); printf(" Events Pcnt (>=%d%%)\n", annotate_opts.min_pcnt); - more = symbol__annotate_printf(&he->ms, top->sym_evsel, &annotate_opts); + more = symbol__annotate_printf(&he->ms, top->sym_evsel); if (top->evlist->enabled) { if (top->zero) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 163f916fff68e..ed0e692afdbe9 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -114,7 +114,7 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int if (!browser->navkeypressed) ops.width += 1; - annotation_line__write(al, notes, &ops, ab->opts); + annotation_line__write(al, notes, &ops); if (ops.current_entry) ab->selection = al; @@ -884,7 +884,7 @@ show_sup_ins: continue; } case 'P': - map_symbol__annotation_dump(ms, evsel, browser->opts); + map_symbol__annotation_dump(ms, evsel); continue; case 't': if (symbol_conf.show_total_period) { @@ -979,7 +979,7 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, return -1; if (not_annotated) { - err = symbol__annotate2(ms, evsel, opts, &browser.arch); + err = symbol__annotate2(ms, evsel, &browser.arch); if (err) { char msg[BUFSIZ]; dso->annotate_warned = true; diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c index 2effac77ca8c6..394861245fd3e 100644 --- a/tools/perf/ui/gtk/annotate.c +++ b/tools/perf/ui/gtk/annotate.c @@ -162,7 +162,6 @@ static int perf_gtk__annotate_symbol(GtkWidget *window, struct map_symbol *ms, } static int symbol__gtk_annotate(struct map_symbol *ms, struct evsel *evsel, - struct annotation_options *options, struct hist_browser_timer *hbt) { struct dso *dso = map__dso(ms->map); @@ -176,7 +175,7 @@ static int symbol__gtk_annotate(struct map_symbol *ms, struct evsel *evsel, if (dso->annotate_warned) return -1; - err = symbol__annotate(ms, evsel, options, NULL); + err = symbol__annotate(ms, evsel, NULL); if (err) { char msg[BUFSIZ]; dso->annotate_warned = true; @@ -244,10 +243,9 @@ static int symbol__gtk_annotate(struct map_symbol *ms, struct evsel *evsel, int hist_entry__gtk_annotate(struct hist_entry *he, struct evsel *evsel, - struct annotation_options *options, struct hist_browser_timer *hbt) { - return symbol__gtk_annotate(&he->ms, evsel, options, hbt); + return symbol__gtk_annotate(&he->ms, evsel, hbt); } void perf_gtk__show_annotations(void) diff --git a/tools/perf/ui/gtk/gtk.h b/tools/perf/ui/gtk/gtk.h index 1e84dceb52671..a2b497f03fd6e 100644 --- a/tools/perf/ui/gtk/gtk.h +++ b/tools/perf/ui/gtk/gtk.h @@ -56,13 +56,11 @@ struct evsel; struct evlist; struct hist_entry; struct hist_browser_timer; -struct annotation_options; int evlist__gtk_browse_hists(struct evlist *evlist, const char *help, struct hist_browser_timer *hbt, float min_pcnt); int hist_entry__gtk_annotate(struct hist_entry *he, struct evsel *evsel, - struct annotation_options *options, struct hist_browser_timer *hbt); void perf_gtk__show_annotations(void); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 77b78001b94d0..daff9af552f4f 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1896,7 +1896,6 @@ static int symbol__disassemble_bpf(struct symbol *sym, struct annotate_args *args) { struct annotation *notes = symbol__annotation(sym); - struct annotation_options *opts = args->options; struct bpf_prog_linfo *prog_linfo = NULL; struct bpf_prog_info_node *info_node; int len = sym->end - sym->start; @@ -2006,7 +2005,7 @@ static int symbol__disassemble_bpf(struct symbol *sym, prev_buf_size = buf_size; fflush(s); - if (!opts->hide_src_code && srcline) { + if (!annotate_opts.hide_src_code && srcline) { args->offset = -1; args->line = strdup(srcline); args->line_nr = 0; @@ -2129,7 +2128,7 @@ static char *expand_tabs(char *line, char **storage, size_t *storage_len) static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) { - struct annotation_options *opts = args->options; + struct annotation_options *opts = &annotate_opts; struct map *map = args->ms.map; struct dso *dso = map__dso(map); char *command; @@ -2380,13 +2379,13 @@ void symbol__calc_percent(struct symbol *sym, struct evsel *evsel) } int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, - struct annotation_options *options, struct arch **parch) + struct arch **parch) { struct symbol *sym = ms->sym; struct annotation *notes = symbol__annotation(sym); struct annotate_args args = { .evsel = evsel, - .options = options, + .options = &annotate_opts, }; struct perf_env *env = evsel__env(evsel); const char *arch_name = perf_env__arch(env); @@ -2414,7 +2413,7 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, } args.ms = *ms; - if (notes->options && notes->options->full_addr) + if (annotate_opts.full_addr) notes->start = map__objdump_2mem(ms->map, ms->sym->start); else notes->start = map__rip_2objdump(ms->map, ms->sym->start); @@ -2422,12 +2421,12 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, return symbol__disassemble(sym, &args); } -static void insert_source_line(struct rb_root *root, struct annotation_line *al, - struct annotation_options *opts) +static void insert_source_line(struct rb_root *root, struct annotation_line *al) { struct annotation_line *iter; struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; + unsigned int percent_type = annotate_opts.percent_type; int i, ret; while (*p != NULL) { @@ -2438,7 +2437,7 @@ static void insert_source_line(struct rb_root *root, struct annotation_line *al, if (ret == 0) { for (i = 0; i < al->data_nr; i++) { iter->data[i].percent_sum += annotation_data__percent(&al->data[i], - opts->percent_type); + percent_type); } return; } @@ -2451,7 +2450,7 @@ static void insert_source_line(struct rb_root *root, struct annotation_line *al, for (i = 0; i < al->data_nr; i++) { al->data[i].percent_sum = annotation_data__percent(&al->data[i], - opts->percent_type); + percent_type); } rb_link_node(&al->rb_node, parent, p); @@ -2573,8 +2572,7 @@ static int annotated_source__addr_fmt_width(struct list_head *lines, u64 start) return 0; } -int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel, - struct annotation_options *opts) +int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel) { struct map *map = ms->map; struct symbol *sym = ms->sym; @@ -2585,6 +2583,7 @@ int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel, struct annotation *notes = symbol__annotation(sym); struct sym_hist *h = annotation__histogram(notes, evsel->core.idx); struct annotation_line *pos, *queue = NULL; + struct annotation_options *opts = &annotate_opts; u64 start = map__rip_2objdump(map, sym->start); int printed = 2, queue_len = 0, addr_fmt_width; int more = 0; @@ -2713,8 +2712,7 @@ static void FILE__write_graph(void *fp, int graph) fputs(s, fp); } -static int symbol__annotate_fprintf2(struct symbol *sym, FILE *fp, - struct annotation_options *opts) +static int symbol__annotate_fprintf2(struct symbol *sym, FILE *fp) { struct annotation *notes = symbol__annotation(sym); struct annotation_write_ops wops = { @@ -2731,7 +2729,7 @@ static int symbol__annotate_fprintf2(struct symbol *sym, FILE *fp, list_for_each_entry(al, ¬es->src->source, node) { if (annotation_line__filter(al, notes)) continue; - annotation_line__write(al, notes, &wops, opts); + annotation_line__write(al, notes, &wops); fputc('\n', fp); wops.first_line = false; } @@ -2739,8 +2737,7 @@ static int symbol__annotate_fprintf2(struct symbol *sym, FILE *fp, return 0; } -int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel, - struct annotation_options *opts) +int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel) { const char *ev_name = evsel__name(evsel); char buf[1024]; @@ -2762,7 +2759,7 @@ int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel, fprintf(fp, "%s() %s\nEvent: %s\n\n", ms->sym->name, map__dso(ms->map)->long_name, ev_name); - symbol__annotate_fprintf2(ms->sym, fp, opts); + symbol__annotate_fprintf2(ms->sym, fp); fclose(fp); err = 0; @@ -2939,24 +2936,24 @@ void annotation__init_column_widths(struct annotation *notes, struct symbol *sym void annotation__update_column_widths(struct annotation *notes) { - if (notes->options->use_offset) + if (annotate_opts.use_offset) notes->widths.target = notes->widths.min_addr; - else if (notes->options->full_addr) + else if (annotate_opts.full_addr) notes->widths.target = BITS_PER_LONG / 4; else notes->widths.target = notes->widths.max_addr; notes->widths.addr = notes->widths.target; - if (notes->options->show_nr_jumps) + if (annotate_opts.show_nr_jumps) notes->widths.addr += notes->widths.jumps + 1; } void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *ms) { - notes->options->full_addr = !notes->options->full_addr; + annotate_opts.full_addr = !annotate_opts.full_addr; - if (notes->options->full_addr) + if (annotate_opts.full_addr) notes->start = map__objdump_2mem(ms->map, ms->sym->start); else notes->start = map__rip_2objdump(ms->map, ms->sym->start); @@ -2965,8 +2962,7 @@ void annotation__toggle_full_addr(struct annotation *notes, struct map_symbol *m } static void annotation__calc_lines(struct annotation *notes, struct map *map, - struct rb_root *root, - struct annotation_options *opts) + struct rb_root *root) { struct annotation_line *al; struct rb_root tmp_root = RB_ROOT; @@ -2979,7 +2975,7 @@ static void annotation__calc_lines(struct annotation *notes, struct map *map, double percent; percent = annotation_data__percent(&al->data[i], - opts->percent_type); + annotate_opts.percent_type); if (percent > percent_max) percent_max = percent; @@ -2990,22 +2986,20 @@ static void annotation__calc_lines(struct annotation *notes, struct map *map, al->path = get_srcline(map__dso(map), notes->start + al->offset, NULL, false, true, notes->start + al->offset); - insert_source_line(&tmp_root, al, opts); + insert_source_line(&tmp_root, al); } resort_source_line(root, &tmp_root); } -static void symbol__calc_lines(struct map_symbol *ms, struct rb_root *root, - struct annotation_options *opts) +static void symbol__calc_lines(struct map_symbol *ms, struct rb_root *root) { struct annotation *notes = symbol__annotation(ms->sym); - annotation__calc_lines(notes, ms->map, root, opts); + annotation__calc_lines(notes, ms->map, root); } -int symbol__tty_annotate2(struct map_symbol *ms, struct evsel *evsel, - struct annotation_options *opts) +int symbol__tty_annotate2(struct map_symbol *ms, struct evsel *evsel) { struct dso *dso = map__dso(ms->map); struct symbol *sym = ms->sym; @@ -3014,7 +3008,7 @@ int symbol__tty_annotate2(struct map_symbol *ms, struct evsel *evsel, char buf[1024]; int err; - err = symbol__annotate2(ms, evsel, opts, NULL); + err = symbol__annotate2(ms, evsel, NULL); if (err) { char msg[BUFSIZ]; @@ -3024,31 +3018,31 @@ int symbol__tty_annotate2(struct map_symbol *ms, struct evsel *evsel, return -1; } - if (opts->print_lines) { - srcline_full_filename = opts->full_path; - symbol__calc_lines(ms, &source_line, opts); + if (annotate_opts.print_lines) { + srcline_full_filename = annotate_opts.full_path; + symbol__calc_lines(ms, &source_line); print_summary(&source_line, dso->long_name); } hists__scnprintf_title(hists, buf, sizeof(buf)); fprintf(stdout, "%s, [percent: %s]\n%s() %s\n", - buf, percent_type_str(opts->percent_type), sym->name, dso->long_name); - symbol__annotate_fprintf2(sym, stdout, opts); + buf, percent_type_str(annotate_opts.percent_type), sym->name, + dso->long_name); + symbol__annotate_fprintf2(sym, stdout); annotated_source__purge(symbol__annotation(sym)->src); return 0; } -int symbol__tty_annotate(struct map_symbol *ms, struct evsel *evsel, - struct annotation_options *opts) +int symbol__tty_annotate(struct map_symbol *ms, struct evsel *evsel) { struct dso *dso = map__dso(ms->map); struct symbol *sym = ms->sym; struct rb_root source_line = RB_ROOT; int err; - err = symbol__annotate(ms, evsel, opts, NULL); + err = symbol__annotate(ms, evsel, NULL); if (err) { char msg[BUFSIZ]; @@ -3060,13 +3054,13 @@ int symbol__tty_annotate(struct map_symbol *ms, struct evsel *evsel, symbol__calc_percent(sym, evsel); - if (opts->print_lines) { - srcline_full_filename = opts->full_path; - symbol__calc_lines(ms, &source_line, opts); + if (annotate_opts.print_lines) { + srcline_full_filename = annotate_opts.full_path; + symbol__calc_lines(ms, &source_line); print_summary(&source_line, dso->long_name); } - symbol__annotate_printf(ms, evsel, opts); + symbol__annotate_printf(ms, evsel); annotated_source__purge(symbol__annotation(sym)->src); @@ -3127,7 +3121,7 @@ call_like: obj__printf(obj, " "); } - disasm_line__scnprintf(dl, bf, size, !notes->options->use_offset, notes->widths.max_ins_name); + disasm_line__scnprintf(dl, bf, size, !annotate_opts.use_offset, notes->widths.max_ins_name); } static void ipc_coverage_string(char *bf, int size, struct annotation *notes) @@ -3210,7 +3204,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati else obj__printf(obj, "%*s ", ANNOTATION__IPC_WIDTH - 1, "IPC"); - if (!notes->options->show_minmax_cycle) { + if (!annotate_opts.show_minmax_cycle) { if (al->cycles && al->cycles->avg) obj__printf(obj, "%*" PRIu64 " ", ANNOTATION__CYCLES_WIDTH - 1, al->cycles->avg); @@ -3254,7 +3248,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati if (!*al->line) obj__printf(obj, "%-*s", width - pcnt_width - cycles_width, " "); else if (al->offset == -1) { - if (al->line_nr && notes->options->show_linenr) + if (al->line_nr && annotate_opts.show_linenr) printed = scnprintf(bf, sizeof(bf), "%-*d ", notes->widths.addr + 1, al->line_nr); else printed = scnprintf(bf, sizeof(bf), "%-*s ", notes->widths.addr, " "); @@ -3264,15 +3258,15 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati u64 addr = al->offset; int color = -1; - if (!notes->options->use_offset) + if (!annotate_opts.use_offset) addr += notes->start; - if (!notes->options->use_offset) { + if (!annotate_opts.use_offset) { printed = scnprintf(bf, sizeof(bf), "%" PRIx64 ": ", addr); } else { if (al->jump_sources && - notes->options->offset_level >= ANNOTATION__OFFSET_JUMP_TARGETS) { - if (notes->options->show_nr_jumps) { + annotate_opts.offset_level >= ANNOTATION__OFFSET_JUMP_TARGETS) { + if (annotate_opts.show_nr_jumps) { int prev; printed = scnprintf(bf, sizeof(bf), "%*d ", notes->widths.jumps, @@ -3286,9 +3280,9 @@ print_addr: printed = scnprintf(bf, sizeof(bf), "%*" PRIx64 ": ", notes->widths.target, addr); } else if (ins__is_call(&disasm_line(al)->ins) && - notes->options->offset_level >= ANNOTATION__OFFSET_CALL) { + annotate_opts.offset_level >= ANNOTATION__OFFSET_CALL) { goto print_addr; - } else if (notes->options->offset_level == ANNOTATION__MAX_OFFSET_LEVEL) { + } else if (annotate_opts.offset_level == ANNOTATION__MAX_OFFSET_LEVEL) { goto print_addr; } else { printed = scnprintf(bf, sizeof(bf), "%-*s ", @@ -3310,19 +3304,18 @@ print_addr: } void annotation_line__write(struct annotation_line *al, struct annotation *notes, - struct annotation_write_ops *wops, - struct annotation_options *opts) + struct annotation_write_ops *wops) { __annotation_line__write(al, notes, wops->first_line, wops->current_entry, wops->change_color, wops->width, wops->obj, - opts->percent_type, + annotate_opts.percent_type, wops->set_color, wops->set_percent_color, wops->set_jumps_percent_color, wops->printf, wops->write_graph); } int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, - struct annotation_options *options, struct arch **parch) + struct arch **parch) { struct symbol *sym = ms->sym; struct annotation *notes = symbol__annotation(sym); @@ -3336,11 +3329,11 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, if (evsel__is_group_event(evsel)) nr_pcnt = evsel->core.nr_members; - err = symbol__annotate(ms, evsel, options, parch); + err = symbol__annotate(ms, evsel, parch); if (err) goto out_free_offsets; - notes->options = options; + notes->options = &annotate_opts; symbol__calc_percent(sym, evsel); @@ -3468,10 +3461,9 @@ static unsigned int parse_percent_type(char *str1, char *str2) return type; } -int annotate_parse_percent_type(const struct option *opt, const char *_str, +int annotate_parse_percent_type(const struct option *opt __maybe_unused, const char *_str, int unset __maybe_unused) { - struct annotation_options *opts = opt->value; unsigned int type; char *str1, *str2; int err = -1; @@ -3490,7 +3482,7 @@ int annotate_parse_percent_type(const struct option *opt, const char *_str, if (type == (unsigned int) -1) type = parse_percent_type(str2, str1); if (type != (unsigned int) -1) { - opts->percent_type = type; + annotate_opts.percent_type = type; err = 0; } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 8c1a070725faa..7bf29baa43f54 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -224,8 +224,7 @@ struct annotation_write_ops { }; void annotation_line__write(struct annotation_line *al, struct annotation *notes, - struct annotation_write_ops *ops, - struct annotation_options *opts); + struct annotation_write_ops *ops); int __annotation__scnprintf_samples_period(struct annotation *notes, char *bf, size_t size, @@ -375,11 +374,9 @@ void symbol__annotate_zero_histograms(struct symbol *sym); int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, - struct annotation_options *options, struct arch **parch); int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, - struct annotation_options *options, struct arch **parch); enum symbol_disassemble_errno { @@ -406,20 +403,18 @@ enum symbol_disassemble_errno { int symbol__strerror_disassemble(struct map_symbol *ms, int errnum, char *buf, size_t buflen); -int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel, - struct annotation_options *options); +int symbol__annotate_printf(struct map_symbol *ms, struct evsel *evsel); void symbol__annotate_zero_histogram(struct symbol *sym, int evidx); void symbol__annotate_decay_histogram(struct symbol *sym, int evidx); void annotated_source__purge(struct annotated_source *as); -int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel, - struct annotation_options *opts); +int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel); bool ui__has_annotation(void); -int symbol__tty_annotate(struct map_symbol *ms, struct evsel *evsel, struct annotation_options *opts); +int symbol__tty_annotate(struct map_symbol *ms, struct evsel *evsel); -int symbol__tty_annotate2(struct map_symbol *ms, struct evsel *evsel, struct annotation_options *opts); +int symbol__tty_annotate2(struct map_symbol *ms, struct evsel *evsel); #ifdef HAVE_SLANG_SUPPORT int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, -- GitLab From 22197fb296913f83c7182befd2a8b23bf042f279 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 28 Nov 2023 09:54:38 -0800 Subject: [PATCH 0215/1290] perf ui/browser/annotate: Use global annotation_options Now it can use the global options and no need save local browser options separately. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231128175441.721579-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 2 +- tools/perf/builtin-report.c | 8 ++-- tools/perf/builtin-top.c | 3 +- tools/perf/ui/browsers/annotate.c | 65 ++++++++++++++----------------- tools/perf/ui/browsers/hists.c | 34 ++++++---------- tools/perf/ui/browsers/hists.h | 2 - tools/perf/util/annotate.h | 6 +-- tools/perf/util/block-info.c | 6 +-- tools/perf/util/block-info.h | 3 +- tools/perf/util/hist.h | 25 ++++-------- 10 files changed, 59 insertions(+), 95 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 9c1e2b2b5bc0b..d17213bd83327 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -381,7 +381,7 @@ find_next: /* skip missing symbols */ nd = rb_next(nd); } else if (use_browser == 1) { - key = hist_entry__tui_annotate(he, evsel, NULL, &annotate_opts); + key = hist_entry__tui_annotate(he, evsel, NULL); switch (key) { case -1: diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 2b86651615cd7..bc0d986c1e0c1 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -540,8 +540,7 @@ static int evlist__tui_block_hists_browse(struct evlist *evlist, struct report * evlist__for_each_entry(evlist, pos) { ret = report__browse_block_hists(&rep->block_reports[i++].hist, rep->min_percent, pos, - &rep->session->header.env, - &annotate_opts); + &rep->session->header.env); if (ret != 0) return ret; } @@ -573,8 +572,7 @@ static int evlist__tty_browse_hists(struct evlist *evlist, struct report *rep, c if (rep->total_cycles_mode) { report__browse_block_hists(&rep->block_reports[i++].hist, - rep->min_percent, pos, - NULL, NULL); + rep->min_percent, pos, NULL); continue; } @@ -669,7 +667,7 @@ static int report__browse_hists(struct report *rep) } ret = evlist__tui_browse_hists(evlist, help, NULL, rep->min_percent, - &session->header.env, true, &annotate_opts); + &session->header.env, true); /* * Usually "ret" is the last pressed key, and we only * care if the key notifies us to switch data file. diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 46a61634701b7..b5222d2419832 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -646,8 +646,7 @@ repeat: } ret = evlist__tui_browse_hists(top->evlist, help, &hbt, top->min_percent, - &top->session->header.env, !top->record_opts.overwrite, - &annotate_opts); + &top->session->header.env, !top->record_opts.overwrite); if (ret == K_RELOAD) { top->zero = true; goto repeat; diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index ed0e692afdbe9..fda17c1f20319 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -27,7 +27,6 @@ struct annotate_browser { struct rb_node *curr_hot; struct annotation_line *selection; struct arch *arch; - struct annotation_options *opts; bool searching_backwards; char search_bf[128]; }; @@ -97,7 +96,7 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int struct annotation_write_ops ops = { .first_line = row == 0, .current_entry = is_current_entry, - .change_color = (!notes->options->hide_src_code && + .change_color = (!annotate_opts.hide_src_code && (!is_current_entry || (browser->use_navkeypressed && !browser->navkeypressed))), @@ -128,7 +127,7 @@ static int is_fused(struct annotate_browser *ab, struct disasm_line *cursor) while (pos && pos->al.offset == -1) { pos = list_prev_entry(pos, al.node); - if (!ab->opts->hide_src_code) + if (!annotate_opts.hide_src_code) diff++; } @@ -195,7 +194,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) return; } - if (notes->options->hide_src_code) { + if (annotate_opts.hide_src_code) { from = cursor->al.idx_asm; to = target->idx_asm; } else { @@ -224,7 +223,7 @@ static unsigned int annotate_browser__refresh(struct ui_browser *browser) int ret = ui_browser__list_head_refresh(browser); int pcnt_width = annotation__pcnt_width(notes); - if (notes->options->jump_arrows) + if (annotate_opts.jump_arrows) annotate_browser__draw_current_jump(browser); ui_browser__set_color(browser, HE_COLORSET_NORMAL); @@ -258,7 +257,7 @@ static void disasm_rb_tree__insert(struct annotate_browser *browser, parent = *p; l = rb_entry(parent, struct annotation_line, rb_node); - if (disasm__cmp(al, l, browser->opts->percent_type) < 0) + if (disasm__cmp(al, l, annotate_opts.percent_type) < 0) p = &(*p)->rb_left; else p = &(*p)->rb_right; @@ -294,11 +293,10 @@ static void annotate_browser__set_top(struct annotate_browser *browser, static void annotate_browser__set_rb_top(struct annotate_browser *browser, struct rb_node *nd) { - struct annotation *notes = browser__annotation(&browser->b); struct annotation_line * pos = rb_entry(nd, struct annotation_line, rb_node); u32 idx = pos->idx; - if (notes->options->hide_src_code) + if (annotate_opts.hide_src_code) idx = pos->idx_asm; annotate_browser__set_top(browser, pos, idx); browser->curr_hot = nd; @@ -331,7 +329,7 @@ static void annotate_browser__calc_percent(struct annotate_browser *browser, double percent; percent = annotation_data__percent(&pos->al.data[i], - browser->opts->percent_type); + annotate_opts.percent_type); if (max_percent < percent) max_percent = percent; @@ -380,12 +378,12 @@ static bool annotate_browser__toggle_source(struct annotate_browser *browser) browser->b.seek(&browser->b, offset, SEEK_CUR); al = list_entry(browser->b.top, struct annotation_line, node); - if (notes->options->hide_src_code) { + if (annotate_opts.hide_src_code) { if (al->idx_asm < offset) offset = al->idx; browser->b.nr_entries = notes->src->nr_entries; - notes->options->hide_src_code = false; + annotate_opts.hide_src_code = false; browser->b.seek(&browser->b, -offset, SEEK_CUR); browser->b.top_idx = al->idx - offset; browser->b.index = al->idx; @@ -403,7 +401,7 @@ static bool annotate_browser__toggle_source(struct annotate_browser *browser) offset = al->idx_asm; browser->b.nr_entries = notes->src->nr_asm_entries; - notes->options->hide_src_code = true; + annotate_opts.hide_src_code = true; browser->b.seek(&browser->b, -offset, SEEK_CUR); browser->b.top_idx = al->idx_asm - offset; browser->b.index = al->idx_asm; @@ -483,8 +481,8 @@ static bool annotate_browser__callq(struct annotate_browser *browser, target_ms.map = ms->map; target_ms.sym = dl->ops.target.sym; annotation__unlock(notes); - symbol__tui_annotate(&target_ms, evsel, hbt, browser->opts); - sym_title(ms->sym, ms->map, title, sizeof(title), browser->opts->percent_type); + symbol__tui_annotate(&target_ms, evsel, hbt); + sym_title(ms->sym, ms->map, title, sizeof(title), annotate_opts.percent_type); ui_browser__show_title(&browser->b, title); return true; } @@ -659,7 +657,6 @@ bool annotate_browser__continue_search_reverse(struct annotate_browser *browser, static int annotate_browser__show(struct ui_browser *browser, char *title, const char *help) { - struct annotate_browser *ab = container_of(browser, struct annotate_browser, b); struct map_symbol *ms = browser->priv; struct symbol *sym = ms->sym; char symbol_dso[SYM_TITLE_MAX_SIZE]; @@ -667,7 +664,7 @@ static int annotate_browser__show(struct ui_browser *browser, char *title, const if (ui_browser__show(browser, title, help) < 0) return -1; - sym_title(sym, ms->map, symbol_dso, sizeof(symbol_dso), ab->opts->percent_type); + sym_title(sym, ms->map, symbol_dso, sizeof(symbol_dso), annotate_opts.percent_type); ui_browser__gotorc_title(browser, 0, 0); ui_browser__set_color(browser, HE_COLORSET_ROOT); @@ -809,7 +806,7 @@ static int annotate_browser__run(struct annotate_browser *browser, annotate_browser__show(&browser->b, title, help); continue; case 'k': - notes->options->show_linenr = !notes->options->show_linenr; + annotate_opts.show_linenr = !annotate_opts.show_linenr; continue; case 'l': annotate_browser__show_full_location (&browser->b); @@ -822,18 +819,18 @@ static int annotate_browser__run(struct annotate_browser *browser, ui_helpline__puts(help); continue; case 'o': - notes->options->use_offset = !notes->options->use_offset; + annotate_opts.use_offset = !annotate_opts.use_offset; annotation__update_column_widths(notes); continue; case 'O': - if (++notes->options->offset_level > ANNOTATION__MAX_OFFSET_LEVEL) - notes->options->offset_level = ANNOTATION__MIN_OFFSET_LEVEL; + if (++annotate_opts.offset_level > ANNOTATION__MAX_OFFSET_LEVEL) + annotate_opts.offset_level = ANNOTATION__MIN_OFFSET_LEVEL; continue; case 'j': - notes->options->jump_arrows = !notes->options->jump_arrows; + annotate_opts.jump_arrows = !annotate_opts.jump_arrows; continue; case 'J': - notes->options->show_nr_jumps = !notes->options->show_nr_jumps; + annotate_opts.show_nr_jumps = !annotate_opts.show_nr_jumps; annotation__update_column_widths(notes); continue; case '/': @@ -897,15 +894,15 @@ show_sup_ins: annotation__update_column_widths(notes); continue; case 'c': - if (notes->options->show_minmax_cycle) - notes->options->show_minmax_cycle = false; + if (annotate_opts.show_minmax_cycle) + annotate_opts.show_minmax_cycle = false; else - notes->options->show_minmax_cycle = true; + annotate_opts.show_minmax_cycle = true; annotation__update_column_widths(notes); continue; case 'p': case 'b': - switch_percent_type(browser->opts, key == 'b'); + switch_percent_type(&annotate_opts, key == 'b'); hists__scnprintf_title(hists, title, sizeof(title)); annotate_browser__show(&browser->b, title, help); continue; @@ -932,26 +929,23 @@ out: } int map_symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, - struct hist_browser_timer *hbt, - struct annotation_options *opts) + struct hist_browser_timer *hbt) { - return symbol__tui_annotate(ms, evsel, hbt, opts); + return symbol__tui_annotate(ms, evsel, hbt); } int hist_entry__tui_annotate(struct hist_entry *he, struct evsel *evsel, - struct hist_browser_timer *hbt, - struct annotation_options *opts) + struct hist_browser_timer *hbt) { /* reset abort key so that it can get Ctrl-C as a key */ SLang_reset_tty(); SLang_init_tty(0, 0, 0); - return map_symbol__tui_annotate(&he->ms, evsel, hbt, opts); + return map_symbol__tui_annotate(&he->ms, evsel, hbt); } int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, - struct hist_browser_timer *hbt, - struct annotation_options *opts) + struct hist_browser_timer *hbt) { struct symbol *sym = ms->sym; struct annotation *notes = symbol__annotation(sym); @@ -965,7 +959,6 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, .priv = ms, .use_navkeypressed = true, }, - .opts = opts, }; struct dso *dso; int ret = -1, err; @@ -996,7 +989,7 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, browser.b.entries = ¬es->src->source, browser.b.width += 18; /* Percentage */ - if (notes->options->hide_src_code) + if (annotate_opts.hide_src_code) ui_browser__init_asm_mode(&browser.b); ret = annotate_browser__run(&browser, evsel, hbt); diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index f4812b2268181..3061dea29e6b6 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2250,8 +2250,7 @@ struct hist_browser *hist_browser__new(struct hists *hists) static struct hist_browser * perf_evsel_browser__new(struct evsel *evsel, struct hist_browser_timer *hbt, - struct perf_env *env, - struct annotation_options *annotation_opts) + struct perf_env *env) { struct hist_browser *browser = hist_browser__new(evsel__hists(evsel)); @@ -2259,7 +2258,6 @@ perf_evsel_browser__new(struct evsel *evsel, browser->hbt = hbt; browser->env = env; browser->title = hists_browser__scnprintf_title; - browser->annotation_opts = annotation_opts; } return browser; } @@ -2432,8 +2430,8 @@ do_annotate(struct hist_browser *browser, struct popup_action *act) struct hist_entry *he; int err; - if (!browser->annotation_opts->objdump_path && - perf_env__lookup_objdump(browser->env, &browser->annotation_opts->objdump_path)) + if (!annotate_opts.objdump_path && + perf_env__lookup_objdump(browser->env, &annotate_opts.objdump_path)) return 0; notes = symbol__annotation(act->ms.sym); @@ -2445,8 +2443,7 @@ do_annotate(struct hist_browser *browser, struct popup_action *act) else evsel = hists_to_evsel(browser->hists); - err = map_symbol__tui_annotate(&act->ms, evsel, browser->hbt, - browser->annotation_opts); + err = map_symbol__tui_annotate(&act->ms, evsel, browser->hbt); he = hist_browser__selected_entry(browser); /* * offer option to annotate the other branch source or target @@ -2943,11 +2940,10 @@ next: static int evsel__hists_browse(struct evsel *evsel, int nr_events, const char *helpline, bool left_exits, struct hist_browser_timer *hbt, float min_pcnt, - struct perf_env *env, bool warn_lost_event, - struct annotation_options *annotation_opts) + struct perf_env *env, bool warn_lost_event) { struct hists *hists = evsel__hists(evsel); - struct hist_browser *browser = perf_evsel_browser__new(evsel, hbt, env, annotation_opts); + struct hist_browser *browser = perf_evsel_browser__new(evsel, hbt, env); struct branch_info *bi = NULL; #define MAX_OPTIONS 16 char *options[MAX_OPTIONS]; @@ -3398,7 +3394,6 @@ out: struct evsel_menu { struct ui_browser b; struct evsel *selection; - struct annotation_options *annotation_opts; bool lost_events, lost_events_warned; float min_pcnt; struct perf_env *env; @@ -3499,8 +3494,7 @@ browse_hists: hbt->timer(hbt->arg); key = evsel__hists_browse(pos, nr_events, help, true, hbt, menu->min_pcnt, menu->env, - warn_lost_event, - menu->annotation_opts); + warn_lost_event); ui_browser__show_title(&menu->b, title); switch (key) { case K_TAB: @@ -3557,7 +3551,7 @@ static bool filter_group_entries(struct ui_browser *browser __maybe_unused, static int __evlist__tui_browse_hists(struct evlist *evlist, int nr_entries, const char *help, struct hist_browser_timer *hbt, float min_pcnt, struct perf_env *env, - bool warn_lost_event, struct annotation_options *annotation_opts) + bool warn_lost_event) { struct evsel *pos; struct evsel_menu menu = { @@ -3572,7 +3566,6 @@ static int __evlist__tui_browse_hists(struct evlist *evlist, int nr_entries, con }, .min_pcnt = min_pcnt, .env = env, - .annotation_opts = annotation_opts, }; ui_helpline__push("Press ESC to exit"); @@ -3607,8 +3600,7 @@ static bool evlist__single_entry(struct evlist *evlist) } int evlist__tui_browse_hists(struct evlist *evlist, const char *help, struct hist_browser_timer *hbt, - float min_pcnt, struct perf_env *env, bool warn_lost_event, - struct annotation_options *annotation_opts) + float min_pcnt, struct perf_env *env, bool warn_lost_event) { int nr_entries = evlist->core.nr_entries; @@ -3617,7 +3609,7 @@ single_entry: { struct evsel *first = evlist__first(evlist); return evsel__hists_browse(first, nr_entries, help, false, hbt, min_pcnt, - env, warn_lost_event, annotation_opts); + env, warn_lost_event); } } @@ -3635,7 +3627,7 @@ single_entry: { } return __evlist__tui_browse_hists(evlist, nr_entries, help, hbt, min_pcnt, env, - warn_lost_event, annotation_opts); + warn_lost_event); } static int block_hists_browser__title(struct hist_browser *browser, char *bf, @@ -3654,8 +3646,7 @@ static int block_hists_browser__title(struct hist_browser *browser, char *bf, } int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, - float min_percent, struct perf_env *env, - struct annotation_options *annotation_opts) + float min_percent, struct perf_env *env) { struct hists *hists = &bh->block_hists; struct hist_browser *browser; @@ -3672,7 +3663,6 @@ int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, browser->title = block_hists_browser__title; browser->min_pcnt = min_percent; browser->env = env; - browser->annotation_opts = annotation_opts; /* reset abort key so that it can get Ctrl-C as a key */ SLang_reset_tty(); diff --git a/tools/perf/ui/browsers/hists.h b/tools/perf/ui/browsers/hists.h index 1e938d9ffa5ee..de46f6c56b0ef 100644 --- a/tools/perf/ui/browsers/hists.h +++ b/tools/perf/ui/browsers/hists.h @@ -4,7 +4,6 @@ #include "ui/browser.h" -struct annotation_options; struct evsel; struct hist_browser { @@ -15,7 +14,6 @@ struct hist_browser { struct hist_browser_timer *hbt; struct pstack *pstack; struct perf_env *env; - struct annotation_options *annotation_opts; struct evsel *block_evsel; int print_seq; bool show_dso; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 7bf29baa43f54..857c5fa0e6b16 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -418,13 +418,11 @@ int symbol__tty_annotate2(struct map_symbol *ms, struct evsel *evsel); #ifdef HAVE_SLANG_SUPPORT int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, - struct hist_browser_timer *hbt, - struct annotation_options *opts); + struct hist_browser_timer *hbt); #else static inline int symbol__tui_annotate(struct map_symbol *ms __maybe_unused, struct evsel *evsel __maybe_unused, - struct hist_browser_timer *hbt __maybe_unused, - struct annotation_options *opts __maybe_unused) + struct hist_browser_timer *hbt __maybe_unused) { return 0; } diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c index 08f82c1f166c3..dec910989701e 100644 --- a/tools/perf/util/block-info.c +++ b/tools/perf/util/block-info.c @@ -464,8 +464,7 @@ void block_info__free_report(struct block_report *reps, int nr_reps) } int report__browse_block_hists(struct block_hist *bh, float min_percent, - struct evsel *evsel, struct perf_env *env, - struct annotation_options *annotation_opts) + struct evsel *evsel, struct perf_env *env) { int ret; @@ -477,8 +476,7 @@ int report__browse_block_hists(struct block_hist *bh, float min_percent, return 0; case 1: symbol_conf.report_individual_block = true; - ret = block_hists_tui_browse(bh, evsel, min_percent, - env, annotation_opts); + ret = block_hists_tui_browse(bh, evsel, min_percent, env); return ret; default: return -1; diff --git a/tools/perf/util/block-info.h b/tools/perf/util/block-info.h index 42e9dcc4cf0ab..96f53e89795e2 100644 --- a/tools/perf/util/block-info.h +++ b/tools/perf/util/block-info.h @@ -78,8 +78,7 @@ struct block_report *block_info__create_report(struct evlist *evlist, void block_info__free_report(struct block_report *reps, int nr_reps); int report__browse_block_hists(struct block_hist *bh, float min_percent, - struct evsel *evsel, struct perf_env *env, - struct annotation_options *annotation_opts); + struct evsel *evsel, struct perf_env *env); float block_info__total_cycles_percent(struct hist_entry *he); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index afc9f1c7f4dc2..5d0db96609dff 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -457,7 +457,6 @@ struct hist_browser_timer { int refresh; }; -struct annotation_options; struct res_sample; enum rstype { @@ -473,16 +472,13 @@ struct block_hist; void attr_to_script(char *buf, struct perf_event_attr *attr); int map_symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, - struct hist_browser_timer *hbt, - struct annotation_options *annotation_opts); + struct hist_browser_timer *hbt); int hist_entry__tui_annotate(struct hist_entry *he, struct evsel *evsel, - struct hist_browser_timer *hbt, - struct annotation_options *annotation_opts); + struct hist_browser_timer *hbt); int evlist__tui_browse_hists(struct evlist *evlist, const char *help, struct hist_browser_timer *hbt, - float min_pcnt, struct perf_env *env, bool warn_lost_event, - struct annotation_options *annotation_options); + float min_pcnt, struct perf_env *env, bool warn_lost_event); int script_browse(const char *script_opt, struct evsel *evsel); @@ -492,8 +488,7 @@ int res_sample_browse(struct res_sample *res_samples, int num_res, void res_sample_init(void); int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, - float min_percent, struct perf_env *env, - struct annotation_options *annotation_opts); + float min_percent, struct perf_env *env); #else static inline int evlist__tui_browse_hists(struct evlist *evlist __maybe_unused, @@ -501,23 +496,20 @@ int evlist__tui_browse_hists(struct evlist *evlist __maybe_unused, struct hist_browser_timer *hbt __maybe_unused, float min_pcnt __maybe_unused, struct perf_env *env __maybe_unused, - bool warn_lost_event __maybe_unused, - struct annotation_options *annotation_options __maybe_unused) + bool warn_lost_event __maybe_unused) { return 0; } static inline int map_symbol__tui_annotate(struct map_symbol *ms __maybe_unused, struct evsel *evsel __maybe_unused, - struct hist_browser_timer *hbt __maybe_unused, - struct annotation_options *annotation_options __maybe_unused) + struct hist_browser_timer *hbt __maybe_unused) { return 0; } static inline int hist_entry__tui_annotate(struct hist_entry *he __maybe_unused, struct evsel *evsel __maybe_unused, - struct hist_browser_timer *hbt __maybe_unused, - struct annotation_options *annotation_opts __maybe_unused) + struct hist_browser_timer *hbt __maybe_unused) { return 0; } @@ -541,8 +533,7 @@ static inline void res_sample_init(void) {} static inline int block_hists_tui_browse(struct block_hist *bh __maybe_unused, struct evsel *evsel __maybe_unused, float min_percent __maybe_unused, - struct perf_env *env __maybe_unused, - struct annotation_options *annotation_opts __maybe_unused) + struct perf_env *env __maybe_unused) { return 0; } -- GitLab From 7f929aea21fd0be5e0d9ee5827d5b809daa69f29 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 28 Nov 2023 09:54:39 -0800 Subject: [PATCH 0216/1290] perf annotate: Ensure init/exit for global options Now it only cares about the global options so it can just handle it without the argument. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231128175441.721579-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 8 ++++---- tools/perf/builtin-report.c | 8 ++++---- tools/perf/builtin-top.c | 8 ++++---- tools/perf/util/annotate.c | 19 +++++++++++-------- tools/perf/util/annotate.h | 8 ++++---- 5 files changed, 27 insertions(+), 24 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index d17213bd83327..d880f1b039fdf 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -615,13 +615,13 @@ int cmd_annotate(int argc, const char **argv) set_option_flag(options, 0, "show-total-period", PARSE_OPT_EXCLUSIVE); set_option_flag(options, 0, "show-nr-samples", PARSE_OPT_EXCLUSIVE); - annotation_options__init(&annotate_opts); + annotation_options__init(); ret = hists__init(); if (ret < 0) return ret; - annotation_config__init(&annotate_opts); + annotation_config__init(); argc = parse_options(argc, argv, options, annotate_usage, 0); if (argc) { @@ -651,7 +651,7 @@ int cmd_annotate(int argc, const char **argv) return -ENOMEM; } - if (annotate_check_args(&annotate_opts) < 0) + if (annotate_check_args() < 0) return -EINVAL; #ifdef HAVE_GTK2_SUPPORT @@ -732,7 +732,7 @@ out_delete: #ifndef NDEBUG perf_session__delete(annotate.session); #endif - annotation_options__exit(&annotate_opts); + annotation_options__exit(); return ret; } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index bc0d986c1e0c1..17fb171e898b9 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1430,7 +1430,7 @@ int cmd_report(int argc, const char **argv) */ symbol_conf.keep_exited_threads = true; - annotation_options__init(&annotate_opts); + annotation_options__init(); ret = perf_config(report__config, &report); if (ret) @@ -1464,7 +1464,7 @@ int cmd_report(int argc, const char **argv) return -ENOMEM; } - if (annotate_check_args(&annotate_opts) < 0) { + if (annotate_check_args() < 0) { ret = -EINVAL; goto exit; } @@ -1696,7 +1696,7 @@ repeat: */ symbol_conf.priv_size += sizeof(u32); } - annotation_config__init(&annotate_opts); + annotation_config__init(); } if (symbol__init(&session->header.env) < 0) @@ -1750,7 +1750,7 @@ error: zstd_fini(&(session->zstd_data)); perf_session__delete(session); exit: - annotation_options__exit(&annotate_opts); + annotation_options__exit(); free(sort_order_help); free(field_order_help); return ret; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index b5222d2419832..ed83afeeced05 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1608,7 +1608,7 @@ int cmd_top(int argc, const char **argv) if (status < 0) return status; - annotation_options__init(&annotate_opts); + annotation_options__init(); annotate_opts.min_pcnt = 5; annotate_opts.context = 4; @@ -1660,7 +1660,7 @@ int cmd_top(int argc, const char **argv) if (status) goto out_delete_evlist; - if (annotate_check_args(&annotate_opts) < 0) + if (annotate_check_args() < 0) goto out_delete_evlist; if (!top.evlist->core.nr_entries) { @@ -1786,7 +1786,7 @@ int cmd_top(int argc, const char **argv) if (status < 0) goto out_delete_evlist; - annotation_config__init(&annotate_opts); + annotation_config__init(); symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL); status = symbol__init(NULL); @@ -1839,7 +1839,7 @@ int cmd_top(int argc, const char **argv) out_delete_evlist: evlist__delete(top.evlist); perf_session__delete(top.session); - annotation_options__exit(&annotate_opts); + annotation_options__exit(); return status; } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index daff9af552f4f..626ff3baeb855 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -3416,8 +3416,10 @@ static int annotation__config(const char *var, const char *value, void *data) return 0; } -void annotation_options__init(struct annotation_options *opt) +void annotation_options__init(void) { + struct annotation_options *opt = &annotate_opts; + memset(opt, 0, sizeof(*opt)); /* Default values. */ @@ -3428,16 +3430,15 @@ void annotation_options__init(struct annotation_options *opt) opt->percent_type = PERCENT_PERIOD_LOCAL; } - -void annotation_options__exit(struct annotation_options *opt) +void annotation_options__exit(void) { - zfree(&opt->disassembler_style); - zfree(&opt->objdump_path); + zfree(&annotate_opts.disassembler_style); + zfree(&annotate_opts.objdump_path); } -void annotation_config__init(struct annotation_options *opt) +void annotation_config__init(void) { - perf_config(annotation__config, opt); + perf_config(annotation__config, &annotate_opts); } static unsigned int parse_percent_type(char *str1, char *str2) @@ -3491,8 +3492,10 @@ out: return err; } -int annotate_check_args(struct annotation_options *args) +int annotate_check_args(void) { + struct annotation_options *args = &annotate_opts; + if (args->prefix_strip && !args->prefix) { pr_err("--prefix-strip requires --prefix\n"); return -1; diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 857c5fa0e6b16..4283eb4522b28 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -428,14 +428,14 @@ static inline int symbol__tui_annotate(struct map_symbol *ms __maybe_unused, } #endif -void annotation_options__init(struct annotation_options *opt); -void annotation_options__exit(struct annotation_options *opt); +void annotation_options__init(void); +void annotation_options__exit(void); -void annotation_config__init(struct annotation_options *opt); +void annotation_config__init(void); int annotate_parse_percent_type(const struct option *opt, const char *_str, int unset); -int annotate_check_args(struct annotation_options *args); +int annotate_check_args(void); #endif /* __PERF_ANNOTATE_H */ -- GitLab From 2fa21d694c63081f26444847c916e5fc83bcefa1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 28 Nov 2023 09:54:40 -0800 Subject: [PATCH 0217/1290] perf annotate: Remove remaining usages of local annotation options So that it can get rid of the unused data. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231128175441.721579-8-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 14 ++++++-------- tools/perf/util/annotate.c | 2 +- tools/perf/util/annotate.h | 6 +++--- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index fda17c1f20319..cb2eb6dcb5321 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -37,11 +37,10 @@ static inline struct annotation *browser__annotation(struct ui_browser *browser) return symbol__annotation(ms->sym); } -static bool disasm_line__filter(struct ui_browser *browser, void *entry) +static bool disasm_line__filter(struct ui_browser *browser __maybe_unused, void *entry) { - struct annotation *notes = browser__annotation(browser); struct annotation_line *al = list_entry(entry, struct annotation_line, node); - return annotation_line__filter(al, notes); + return annotation_line__filter(al); } static int ui_browser__jumps_percent_color(struct ui_browser *browser, int nr, bool current) @@ -269,7 +268,6 @@ static void disasm_rb_tree__insert(struct annotate_browser *browser, static void annotate_browser__set_top(struct annotate_browser *browser, struct annotation_line *pos, u32 idx) { - struct annotation *notes = browser__annotation(&browser->b); unsigned back; ui_browser__refresh_dimensions(&browser->b); @@ -279,7 +277,7 @@ static void annotate_browser__set_top(struct annotate_browser *browser, while (browser->b.top_idx != 0 && back != 0) { pos = list_entry(pos->node.prev, struct annotation_line, node); - if (annotation_line__filter(pos, notes)) + if (annotation_line__filter(pos)) continue; --browser->b.top_idx; @@ -498,7 +496,7 @@ struct disasm_line *annotate_browser__find_offset(struct annotate_browser *brows list_for_each_entry(pos, ¬es->src->source, al.node) { if (pos->al.offset == offset) return pos; - if (!annotation_line__filter(&pos->al, notes)) + if (!annotation_line__filter(&pos->al)) ++*idx; } @@ -542,7 +540,7 @@ struct annotation_line *annotate_browser__find_string(struct annotate_browser *b *idx = browser->b.index; list_for_each_entry_continue(al, ¬es->src->source, node) { - if (annotation_line__filter(al, notes)) + if (annotation_line__filter(al)) continue; ++*idx; @@ -579,7 +577,7 @@ struct annotation_line *annotate_browser__find_string_reverse(struct annotate_br *idx = browser->b.index; list_for_each_entry_continue_reverse(al, ¬es->src->source, node) { - if (annotation_line__filter(al, notes)) + if (annotation_line__filter(al)) continue; --*idx; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 626ff3baeb855..09c399ab03843 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -2727,7 +2727,7 @@ static int symbol__annotate_fprintf2(struct symbol *sym, FILE *fp) struct annotation_line *al; list_for_each_entry(al, ¬es->src->source, node) { - if (annotation_line__filter(al, notes)) + if (annotation_line__filter(al)) continue; annotation_line__write(al, notes, &wops); fputc('\n', fp); diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 4283eb4522b28..6d5a6bb49a975 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -320,7 +320,7 @@ bool annotation__trylock(struct annotation *notes) EXCLUSIVE_TRYLOCK_FUNCTION(tr static inline int annotation__cycles_width(struct annotation *notes) { - if (notes->branch && notes->options->show_minmax_cycle) + if (notes->branch && annotate_opts.show_minmax_cycle) return ANNOTATION__IPC_WIDTH + ANNOTATION__MINMAX_CYCLES_WIDTH; return notes->branch ? ANNOTATION__IPC_WIDTH + ANNOTATION__CYCLES_WIDTH : 0; @@ -331,9 +331,9 @@ static inline int annotation__pcnt_width(struct annotation *notes) return (symbol_conf.show_total_period ? 12 : 7) * notes->nr_events; } -static inline bool annotation_line__filter(struct annotation_line *al, struct annotation *notes) +static inline bool annotation_line__filter(struct annotation_line *al) { - return notes->options->hide_src_code && al->offset == -1; + return annotate_opts.hide_src_code && al->offset == -1; } void annotation__set_offsets(struct annotation *notes, s64 size); -- GitLab From 327f7533cc596427b62f431c8852951412b6c0dc Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 28 Nov 2023 09:54:41 -0800 Subject: [PATCH 0218/1290] perf annotate: Get rid of local annotation options It doesn't need the option in the struct annotation which is allocated for each symbol. It can directly use the global options and save 8 bytes per symbol. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231128175441.721579-9-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 2 -- tools/perf/util/annotate.h | 1 - 2 files changed, 3 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 09c399ab03843..c81fa0791918e 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -3333,8 +3333,6 @@ int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, if (err) goto out_free_offsets; - notes->options = &annotate_opts; - symbol__calc_percent(sym, evsel); annotation__set_offsets(notes, size); diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 6d5a6bb49a975..589f8aaf02366 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -294,7 +294,6 @@ struct annotated_branch { struct LOCKABLE annotation { u64 start; - struct annotation_options *options; int nr_events; int max_jump_sources; struct { -- GitLab From cb46fca88d14939da2785567253d0a297f31be27 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 29 Aug 2023 08:20:14 -0700 Subject: [PATCH 0219/1290] cxl: Add Support for Get Timestamp Add the call to the UAPI such that userspace may corelate the timestamps from the device log with system wall time, if, for example there's any sort of inaccuracy or skew in the device. Signed-off-by: Davidlohr Bueso Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20230829152014.15452-1-dave@stgolabs.net Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 1 + drivers/cxl/cxlmem.h | 1 + include/uapi/linux/cxl_mem.h | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 36270dcfb42ef..b86dbd25740c3 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -63,6 +63,7 @@ static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = { CXL_CMD(GET_SHUTDOWN_STATE, 0, 0x1, 0), CXL_CMD(SET_SHUTDOWN_STATE, 0x1, 0, 0), CXL_CMD(GET_SCAN_MEDIA_CAPS, 0x10, 0x4, 0), + CXL_CMD(GET_TIMESTAMP, 0, 0x8, 0), }; /* diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index a2fcbca253f39..6a6becee402b9 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -503,6 +503,7 @@ enum cxl_opcode { CXL_MBOX_OP_GET_FW_INFO = 0x0200, CXL_MBOX_OP_TRANSFER_FW = 0x0201, CXL_MBOX_OP_ACTIVATE_FW = 0x0202, + CXL_MBOX_OP_GET_TIMESTAMP = 0x0300, CXL_MBOX_OP_SET_TIMESTAMP = 0x0301, CXL_MBOX_OP_GET_SUPPORTED_LOGS = 0x0400, CXL_MBOX_OP_GET_LOG = 0x0401, diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h index 14bc6e7421483..42066f4eb8903 100644 --- a/include/uapi/linux/cxl_mem.h +++ b/include/uapi/linux/cxl_mem.h @@ -46,6 +46,7 @@ ___C(GET_SCAN_MEDIA_CAPS, "Get Scan Media Capabilities"), \ ___DEPRECATED(SCAN_MEDIA, "Scan Media"), \ ___DEPRECATED(GET_SCAN_MEDIA, "Get Scan Media Results"), \ + ___C(GET_TIMESTAMP, "Get Timestamp"), \ ___C(MAX, "invalid / last command") #define ___C(a, b) CXL_MEM_COMMAND_ID_##a -- GitLab From 2159bd4e905704b1765b6b883ea15e51ad986a6a Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 7 Dec 2023 21:10:01 +0800 Subject: [PATCH 0220/1290] memblock: Return NUMA_NO_NODE instead of -1 to improve code readability When no corresponding memory region is found for the given pfn, return NUMA_NO_NODE instead of -1. This improves code readability and aligns with the existing logic of the memblock_search_pfn_nid() function's user. Signed-off-by: Yuntao Wang Link: https://lore.kernel.org/r/20231207131001.224914-1-ytcoode@gmail.com Signed-off-by: Mike Rapoport (IBM) --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index 5a88d6d24d793..c1f579c23396a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1851,7 +1851,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, int mid = memblock_search(type, PFN_PHYS(pfn)); if (mid == -1) - return -1; + return NUMA_NO_NODE; *start_pfn = PFN_DOWN(type->regions[mid].base); *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); -- GitLab From 765a0542fdc7aad7cbc1da3bd19bed6297b54e2c Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:21 -0800 Subject: [PATCH 0221/1290] x86/virt/tdx: Detect TDX during kernel boot Intel Trust Domain Extensions (TDX) protects guest VMs from malicious host and certain physical attacks. A CPU-attested software module called 'the TDX module' runs inside a new isolated memory range as a trusted hypervisor to manage and run protected VMs. Pre-TDX Intel hardware has support for a memory encryption architecture called MKTME. The memory encryption hardware underpinning MKTME is also used for Intel TDX. TDX ends up "stealing" some of the physical address space from the MKTME architecture for crypto-protection to VMs. The BIOS is responsible for partitioning the "KeyID" space between legacy MKTME and TDX. The KeyIDs reserved for TDX are called 'TDX private KeyIDs' or 'TDX KeyIDs' for short. During machine boot, TDX microcode verifies that the BIOS programmed TDX private KeyIDs consistently and correctly programmed across all CPU packages. The MSRs are locked in this state after verification. This is why MSR_IA32_MKTME_KEYID_PARTITIONING gets used for TDX enumeration: it indicates not just that the hardware supports TDX, but that all the boot-time security checks passed. The TDX module is expected to be loaded by the BIOS when it enables TDX, but the kernel needs to properly initialize it before it can be used to create and run any TDX guests. The TDX module will be initialized by the KVM subsystem when KVM wants to use TDX. Detect platform TDX support by detecting TDX private KeyIDs. The TDX module itself requires one TDX KeyID as the 'TDX global KeyID' to protect its metadata. Each TDX guest also needs a TDX KeyID for its own protection. Just use the first TDX KeyID as the global KeyID and leave the rest for TDX guests. If no TDX KeyID is left for TDX guests, disable TDX as initializing the TDX module alone is useless. [ dhansen: add X86_FEATURE, replace helper function ] Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Reviewed-by: Isaku Yamahata Reviewed-by: David Hildenbrand Reviewed-by: Dave Hansen Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://lore.kernel.org/all/20231208170740.53979-1-dave.hansen%40intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 3 ++ arch/x86/include/asm/tdx.h | 3 ++ arch/x86/kernel/cpu/common.c | 2 + arch/x86/virt/vmx/tdx/Makefile | 2 +- arch/x86/virt/vmx/tdx/tdx.c | 81 ++++++++++++++++++++++++++++++ 6 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 arch/x86/virt/vmx/tdx/tdx.c diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4af140cf5719e..45ddc6b6baaa1 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -198,6 +198,7 @@ #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ +#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* Platform supports being a TDX host */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1d51e1850ed03..66c12d4efa31a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -536,6 +536,9 @@ #define MSR_RELOAD_PMC0 0x000014c1 #define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* KeyID partitioning between MKTME and TDX */ +#define MSR_IA32_MKTME_KEYID_PARTITIONING 0x00000087 + /* * AMD64 MSRs. Not complete. See the architecture manual for a more * complete list. diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index f3d5305a60fc5..e5dd1cb0e1a10 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -83,6 +83,9 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, u64 __seamcall(u64 fn, struct tdx_module_args *args); u64 __seamcall_ret(u64 fn, struct tdx_module_args *args); u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args); +void tdx_init(void); +#else +static inline void tdx_init(void) { } #endif /* CONFIG_INTEL_TDX_HOST */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b14fc8c1c9538..b968a2496e80b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "cpu.h" @@ -1987,6 +1988,7 @@ static __init void identify_boot_cpu(void) setup_cr_pinning(); tsx_init(); + tdx_init(); lkgs_init(); } diff --git a/arch/x86/virt/vmx/tdx/Makefile b/arch/x86/virt/vmx/tdx/Makefile index 46ef8f73aebbb..90da47eb85eec 100644 --- a/arch/x86/virt/vmx/tdx/Makefile +++ b/arch/x86/virt/vmx/tdx/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += seamcall.o +obj-y += seamcall.o tdx.o diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c new file mode 100644 index 0000000000000..94689aef44a65 --- /dev/null +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright(c) 2023 Intel Corporation. + * + * Intel Trusted Domain Extensions (TDX) support + */ + +#define pr_fmt(fmt) "virt/tdx: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static u32 tdx_global_keyid __ro_after_init; +static u32 tdx_guest_keyid_start __ro_after_init; +static u32 tdx_nr_guest_keyids __ro_after_init; + +static __init int record_keyid_partitioning(u32 *tdx_keyid_start, + u32 *nr_tdx_keyids) +{ + u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids; + int ret; + + /* + * IA32_MKTME_KEYID_PARTIONING: + * Bit [31:0]: Number of MKTME KeyIDs. + * Bit [63:32]: Number of TDX private KeyIDs. + */ + ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids, + &_nr_tdx_keyids); + if (ret || !_nr_tdx_keyids) + return -EINVAL; + + /* TDX KeyIDs start after the last MKTME KeyID. */ + _tdx_keyid_start = _nr_mktme_keyids + 1; + + *tdx_keyid_start = _tdx_keyid_start; + *nr_tdx_keyids = _nr_tdx_keyids; + + return 0; +} + +void __init tdx_init(void) +{ + u32 tdx_keyid_start, nr_tdx_keyids; + int err; + + err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids); + if (err) + return; + + pr_info("BIOS enabled: private KeyID range [%u, %u)\n", + tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids); + + /* + * The TDX module itself requires one 'global KeyID' to protect + * its metadata. If there's only one TDX KeyID, there won't be + * any left for TDX guests thus there's no point to enable TDX + * at all. + */ + if (nr_tdx_keyids < 2) { + pr_err("initialization failed: too few private KeyIDs available.\n"); + return; + } + + /* + * Just use the first TDX KeyID as the 'global KeyID' and + * leave the rest for TDX guests. + */ + tdx_global_keyid = tdx_keyid_start; + tdx_guest_keyid_start = tdx_keyid_start + 1; + tdx_nr_guest_keyids = nr_tdx_keyids - 1; + + setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM); +} -- GitLab From d623704bb23901a25bf6d6a40aa16b43a17622eb Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:22 -0800 Subject: [PATCH 0222/1290] x86/virt/tdx: Define TDX supported page sizes as macros TDX supports 4K, 2M and 1G page sizes. The corresponding values are defined by the TDX module spec and used as TDX module ABI. Currently, they are used in try_accept_one() when the TDX guest tries to accept a page. However currently try_accept_one() uses hard-coded magic values. Define TDX supported page sizes as macros and get rid of the hard-coded values in try_accept_one(). TDX host support will need to use them too. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Reviewed-by: Dave Hansen Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/all/20231208170740.53979-2-dave.hansen%40intel.com --- arch/x86/coco/tdx/tdx-shared.c | 6 +++--- arch/x86/include/asm/shared/tdx.h | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/coco/tdx/tdx-shared.c b/arch/x86/coco/tdx/tdx-shared.c index 78e413269791e..1655aa56a0a51 100644 --- a/arch/x86/coco/tdx/tdx-shared.c +++ b/arch/x86/coco/tdx/tdx-shared.c @@ -22,13 +22,13 @@ static unsigned long try_accept_one(phys_addr_t start, unsigned long len, */ switch (pg_level) { case PG_LEVEL_4K: - page_size = 0; + page_size = TDX_PS_4K; break; case PG_LEVEL_2M: - page_size = 1; + page_size = TDX_PS_2M; break; case PG_LEVEL_1G: - page_size = 2; + page_size = TDX_PS_1G; break; default: return 0; diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index ccce7ebd86772..a4036149c4847 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -55,6 +55,11 @@ (TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \ TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15) +/* TDX supported page sizes from the TDX module ABI. */ +#define TDX_PS_4K 0 +#define TDX_PS_2M 1 +#define TDX_PS_1G 2 + #ifndef __ASSEMBLY__ #include -- GitLab From 3115cabd935ade76fbe61154d1e405158e548272 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:23 -0800 Subject: [PATCH 0223/1290] x86/virt/tdx: Make INTEL_TDX_HOST depend on X86_X2APIC TDX capable platforms are locked to X2APIC mode and cannot fall back to the legacy xAPIC mode when TDX is enabled by the BIOS. TDX host support requires x2APIC. Make INTEL_TDX_HOST depend on X86_X2APIC. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: David Hildenbrand Reviewed-by: Kirill A. Shutemov Link: https://lore.kernel.org/lkml/ba80b303-31bf-d44a-b05d-5c0f83038798@intel.com/ Link: https://lore.kernel.org/all/20231208170740.53979-3-dave.hansen%40intel.com --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3762f41bb0929..eb6e63956d51e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1970,6 +1970,7 @@ config INTEL_TDX_HOST depends on CPU_SUP_INTEL depends on X86_64 depends on KVM_INTEL + depends on X86_X2APIC help Intel Trust Domain Extensions (TDX) protects guest VMs from malicious host and certain physical attacks. This option enables necessary TDX -- GitLab From 1e66a7e275393055d98d2306771fe1feadeb1cd6 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:24 -0800 Subject: [PATCH 0224/1290] x86/virt/tdx: Handle SEAMCALL no entropy error in common code Some SEAMCALLs use the RDRAND hardware and can fail for the same reasons as RDRAND. Use the kernel RDRAND retry logic for them. There are three __seamcall*() variants. Do the SEAMCALL retry in common code and add a wrapper for each of them. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-4-dave.hansen%40intel.com --- arch/x86/include/asm/tdx.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index e5dd1cb0e1a10..24c0357081bdf 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -24,6 +24,11 @@ #define TDX_SEAMCALL_GP (TDX_SW_ERROR | X86_TRAP_GP) #define TDX_SEAMCALL_UD (TDX_SW_ERROR | X86_TRAP_UD) +/* + * TDX module SEAMCALL leaf function error codes + */ +#define TDX_RND_NO_ENTROPY 0x8000020300000000ULL + #ifndef __ASSEMBLY__ /* @@ -84,6 +89,27 @@ u64 __seamcall(u64 fn, struct tdx_module_args *args); u64 __seamcall_ret(u64 fn, struct tdx_module_args *args); u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args); void tdx_init(void); + +#include + +typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args); + +static inline u64 sc_retry(sc_func_t func, u64 fn, + struct tdx_module_args *args) +{ + int retry = RDRAND_RETRY_LOOPS; + u64 ret; + + do { + ret = func(fn, args); + } while (ret == TDX_RND_NO_ENTROPY && --retry); + + return ret; +} + +#define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args)) +#define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args)) +#define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args)) #else static inline void tdx_init(void) { } #endif /* CONFIG_INTEL_TDX_HOST */ -- GitLab From df01f5ae07dd9ca0c755943a5cfcc4e78e480f99 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:25 -0800 Subject: [PATCH 0225/1290] x86/virt/tdx: Add SEAMCALL error printing for module initialization The SEAMCALLs involved during the TDX module initialization are not expected to fail. In fact, they are not expected to return any non-zero code (except the "running out of entropy error", which can be handled internally already). Add yet another set of SEAMCALL wrappers, which treats all non-zero return code as error, to support printing SEAMCALL error upon failure for module initialization. Note the TDX module initialization doesn't use the _saved_ret() variant thus no wrapper is added for it. SEAMCALL assembly can also return kernel-defined error codes for three special cases: 1) TDX isn't enabled by the BIOS; 2) TDX module isn't loaded; 3) CPU isn't in VMX operation. Whether they can legally happen depends on the caller, so leave to the caller to print error message when desired. Also convert the SEAMCALL error codes to the kernel error codes in the new wrappers so that each SEAMCALL caller doesn't have to repeat the conversion. [ dhansen: Align the register dump with show_regs(). Zero-pad the contents, split on two lines and use consistent spacing. ] Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-5-dave.hansen%40intel.com --- arch/x86/include/asm/tdx.h | 1 + arch/x86/virt/vmx/tdx/tdx.c | 44 +++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 24c0357081bdf..2c0f4162fe99d 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -27,6 +27,7 @@ /* * TDX module SEAMCALL leaf function error codes */ +#define TDX_SUCCESS 0ULL #define TDX_RND_NO_ENTROPY 0x8000020300000000ULL #ifndef __ASSEMBLY__ diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 94689aef44a65..262b9b64a50ba 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -21,6 +21,50 @@ static u32 tdx_global_keyid __ro_after_init; static u32 tdx_guest_keyid_start __ro_after_init; static u32 tdx_nr_guest_keyids __ro_after_init; +typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args); + +static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args) +{ + pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err); +} + +static inline void seamcall_err_ret(u64 fn, u64 err, + struct tdx_module_args *args) +{ + seamcall_err(fn, err, args); + pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n", + args->rcx, args->rdx, args->r8); + pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n", + args->r9, args->r10, args->r11); +} + +static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func, + u64 fn, struct tdx_module_args *args) +{ + u64 sret = sc_retry(func, fn, args); + + if (sret == TDX_SUCCESS) + return 0; + + if (sret == TDX_SEAMCALL_VMFAILINVALID) + return -ENODEV; + + if (sret == TDX_SEAMCALL_GP) + return -EOPNOTSUPP; + + if (sret == TDX_SEAMCALL_UD) + return -EACCES; + + err_func(fn, sret, args); + return -EIO; +} + +#define seamcall_prerr(__fn, __args) \ + sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args)) + +#define seamcall_prerr_ret(__fn, __args) \ + sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args)) + static __init int record_keyid_partitioning(u32 *tdx_keyid_start, u32 *nr_tdx_keyids) { -- GitLab From 6162b310bc219d18bac970dbd441d7743097d1b9 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:26 -0800 Subject: [PATCH 0226/1290] x86/virt/tdx: Add skeleton to enable TDX on demand There are essentially two steps to get the TDX module ready: 1) Get each CPU ready to run TDX 2) Set up the shared TDX module data structures Introduce and export (to KVM) the infrastructure to do both of these pieces at runtime. == Per-CPU TDX Initialization == Track the initialization status of each CPU with a per-cpu variable. This avoids failures in the case of KVM module reloads and handles cases where CPUs come online later. Generally, the per-cpu SEAMCALLs happen first. But there's actually one global call that has to happen before _any_ others (TDH_SYS_INIT). It's analogous to the boot CPU having to do a bit of extra work just because it happens to be the first one. Track if _any_ CPU has done this call and then only actually do it during the first per-cpu init. == Shared TDX Initialization == Create the global state function (tdx_enable()) as a simple placeholder. The TODO list will be pared down as functionality is added. Use a state machine protected by mutex to make sure the work in tdx_enable() will only be done once. This avoids failures if the KVM module is reloaded. A CPU must be made ready to run TDX before it can participate in initializing the shared parts of the module. Any caller of tdx_enable() need to ensure that it can never run on a CPU which is not ready to run TDX. It needs to be wary of CPU hotplug, preemption and the VMX enabling state of any CPU on which it might run. == Why runtime instead of boot time? == The TDX module can be initialized only once in its lifetime. Instead of always initializing it at boot time, this implementation chooses an "on demand" approach to initialize TDX until there is a real need (e.g when requested by KVM). This approach has below pros: 1) It avoids consuming the memory that must be allocated by kernel and given to the TDX module as metadata (~1/256th of the TDX-usable memory), and also saves the CPU cycles of initializing the TDX module (and the metadata) when TDX is not used at all. 2) The TDX module design allows it to be updated while the system is running. The update procedure shares quite a few steps with this "on demand" initialization mechanism. The hope is that much of "on demand" mechanism can be shared with a future "update" mechanism. A boot-time TDX module implementation would not be able to share much code with the update mechanism. 3) Making SEAMCALL requires VMX to be enabled. Currently, only the KVM code mucks with VMX enabling. If the TDX module were to be initialized separately from KVM (like at boot), the boot code would need to be taught how to muck with VMX enabling and KVM would need to be taught how to cope with that. Making KVM itself responsible for TDX initialization lets the rest of the kernel stay blissfully unaware of VMX. [ dhansen: completely reorder/rewrite changelog ] Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Reviewed-by: Nikolay Borisov Reviewed-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-6-dave.hansen%40intel.com --- arch/x86/include/asm/tdx.h | 4 + arch/x86/virt/vmx/tdx/tdx.c | 167 ++++++++++++++++++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 30 +++++++ 3 files changed, 201 insertions(+) create mode 100644 arch/x86/virt/vmx/tdx/tdx.h diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 2c0f4162fe99d..c54948e1999c0 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -111,8 +111,12 @@ static inline u64 sc_retry(sc_func_t func, u64 fn, #define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args)) #define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args)) #define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args)) +int tdx_cpu_enable(void); +int tdx_enable(void); #else static inline void tdx_init(void) { } +static inline int tdx_cpu_enable(void) { return -ENODEV; } +static inline int tdx_enable(void) { return -ENODEV; } #endif /* CONFIG_INTEL_TDX_HOST */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 262b9b64a50ba..ecb0df8ff8b8f 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -12,15 +12,25 @@ #include #include #include +#include +#include +#include +#include #include #include #include #include +#include "tdx.h" static u32 tdx_global_keyid __ro_after_init; static u32 tdx_guest_keyid_start __ro_after_init; static u32 tdx_nr_guest_keyids __ro_after_init; +static DEFINE_PER_CPU(bool, tdx_lp_initialized); + +static enum tdx_module_status_t tdx_module_status; +static DEFINE_MUTEX(tdx_module_lock); + typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args); static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args) @@ -65,6 +75,163 @@ static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func, #define seamcall_prerr_ret(__fn, __args) \ sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args)) +/* + * Do the module global initialization once and return its result. + * It can be done on any cpu. It's always called with interrupts + * disabled. + */ +static int try_init_module_global(void) +{ + struct tdx_module_args args = {}; + static DEFINE_RAW_SPINLOCK(sysinit_lock); + static bool sysinit_done; + static int sysinit_ret; + + lockdep_assert_irqs_disabled(); + + raw_spin_lock(&sysinit_lock); + + if (sysinit_done) + goto out; + + /* RCX is module attributes and all bits are reserved */ + args.rcx = 0; + sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args); + + /* + * The first SEAMCALL also detects the TDX module, thus + * it can fail due to the TDX module is not loaded. + * Dump message to let the user know. + */ + if (sysinit_ret == -ENODEV) + pr_err("module not loaded\n"); + + sysinit_done = true; +out: + raw_spin_unlock(&sysinit_lock); + return sysinit_ret; +} + +/** + * tdx_cpu_enable - Enable TDX on local cpu + * + * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module + * global initialization SEAMCALL if not done) on local cpu to make this + * cpu be ready to run any other SEAMCALLs. + * + * Always call this function via IPI function calls. + * + * Return 0 on success, otherwise errors. + */ +int tdx_cpu_enable(void) +{ + struct tdx_module_args args = {}; + int ret; + + if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM)) + return -ENODEV; + + lockdep_assert_irqs_disabled(); + + if (__this_cpu_read(tdx_lp_initialized)) + return 0; + + /* + * The TDX module global initialization is the very first step + * to enable TDX. Need to do it first (if hasn't been done) + * before the per-cpu initialization. + */ + ret = try_init_module_global(); + if (ret) + return ret; + + ret = seamcall_prerr(TDH_SYS_LP_INIT, &args); + if (ret) + return ret; + + __this_cpu_write(tdx_lp_initialized, true); + + return 0; +} +EXPORT_SYMBOL_GPL(tdx_cpu_enable); + +static int init_tdx_module(void) +{ + /* + * TODO: + * + * - Build the list of TDX-usable memory regions. + * - Get TDX module "TD Memory Region" (TDMR) global metadata. + * - Construct a list of TDMRs to cover all TDX-usable memory + * regions. + * - Configure the TDMRs and the global KeyID to the TDX module. + * - Configure the global KeyID on all packages. + * - Initialize all TDMRs. + * + * Return error before all steps are done. + */ + return -EINVAL; +} + +static int __tdx_enable(void) +{ + int ret; + + ret = init_tdx_module(); + if (ret) { + pr_err("module initialization failed (%d)\n", ret); + tdx_module_status = TDX_MODULE_ERROR; + return ret; + } + + pr_info("module initialized\n"); + tdx_module_status = TDX_MODULE_INITIALIZED; + + return 0; +} + +/** + * tdx_enable - Enable TDX module to make it ready to run TDX guests + * + * This function assumes the caller has: 1) held read lock of CPU hotplug + * lock to prevent any new cpu from becoming online; 2) done both VMXON + * and tdx_cpu_enable() on all online cpus. + * + * This function can be called in parallel by multiple callers. + * + * Return 0 if TDX is enabled successfully, otherwise error. + */ +int tdx_enable(void) +{ + int ret; + + if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM)) + return -ENODEV; + + lockdep_assert_cpus_held(); + + mutex_lock(&tdx_module_lock); + + switch (tdx_module_status) { + case TDX_MODULE_UNINITIALIZED: + ret = __tdx_enable(); + break; + case TDX_MODULE_INITIALIZED: + /* Already initialized, great, tell the caller. */ + ret = 0; + break; + default: + /* Failed to initialize in the previous attempts */ + ret = -EINVAL; + break; + } + + mutex_unlock(&tdx_module_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tdx_enable); + static __init int record_keyid_partitioning(u32 *tdx_keyid_start, u32 *nr_tdx_keyids) { diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h new file mode 100644 index 0000000000000..a3c52270df5b7 --- /dev/null +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_VIRT_TDX_H +#define _X86_VIRT_TDX_H + +/* + * This file contains both macros and data structures defined by the TDX + * architecture and Linux defined software data structures and functions. + * The two should not be mixed together for better readability. The + * architectural definitions come first. + */ + +/* + * TDX module SEAMCALL leaf functions + */ +#define TDH_SYS_INIT 33 +#define TDH_SYS_LP_INIT 35 + +/* + * Do not put any hardware-defined TDX structure representations below + * this comment! + */ + +/* Kernel defined TDX module status during module initialization. */ +enum tdx_module_status_t { + TDX_MODULE_UNINITIALIZED, + TDX_MODULE_INITIALIZED, + TDX_MODULE_ERROR +}; + +#endif -- GitLab From abe8dbab8f9f8370c26e7b79b49ed795c1b6b70f Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:27 -0800 Subject: [PATCH 0227/1290] x86/virt/tdx: Use all system memory when initializing TDX module as TDX memory Start to transit out the "multi-steps" to initialize the TDX module. TDX provides increased levels of memory confidentiality and integrity. This requires special hardware support for features like memory encryption and storage of memory integrity checksums. Not all memory satisfies these requirements. As a result, TDX introduced the concept of a "Convertible Memory Region" (CMR). During boot, the firmware builds a list of all of the memory ranges which can provide the TDX security guarantees. The list of these ranges is available to the kernel by querying the TDX module. CMRs tell the kernel which memory is TDX compatible. The kernel needs to build a list of memory regions (out of CMRs) as "TDX-usable" memory and pass them to the TDX module. Once this is done, those "TDX-usable" memory regions are fixed during module's lifetime. To keep things simple, assume that all TDX-protected memory will come from the page allocator. Make sure all pages in the page allocator *are* TDX-usable memory. As TDX-usable memory is a fixed configuration, take a snapshot of the memory configuration from memblocks at the time of module initialization (memblocks are modified on memory hotplug). This snapshot is used to enable TDX support for *this* memory configuration only. Use a memory hotplug notifier to ensure that no other RAM can be added outside of this configuration. This approach requires all memblock memory regions at the time of module initialization to be TDX convertible memory to work, otherwise module initialization will fail in a later SEAMCALL when passing those regions to the module. This approach works when all boot-time "system RAM" is TDX convertible memory and no non-TDX-convertible memory is hot-added to the core-mm before module initialization. For instance, on the first generation of TDX machines, both CXL memory and NVDIMM are not TDX convertible memory. Using kmem driver to hot-add any CXL memory or NVDIMM to the core-mm before module initialization will result in failure to initialize the module. The SEAMCALL error code will be available in the dmesg to help user to understand the failure. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: "Huang, Ying" Reviewed-by: Isaku Yamahata Reviewed-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Link: https://lore.kernel.org/all/20231208170740.53979-7-dave.hansen%40intel.com --- arch/x86/Kconfig | 1 + arch/x86/kernel/setup.c | 2 + arch/x86/virt/vmx/tdx/tdx.c | 167 +++++++++++++++++++++++++++++++++++- arch/x86/virt/vmx/tdx/tdx.h | 6 ++ 4 files changed, 174 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb6e63956d51e..2c69ef8448059 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1971,6 +1971,7 @@ config INTEL_TDX_HOST depends on X86_64 depends on KVM_INTEL depends on X86_X2APIC + select ARCH_KEEP_MEMBLOCK help Intel Trust Domain Extensions (TDX) protects guest VMs from malicious host and certain physical attacks. This option enables necessary TDX diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1526747bedf2f..9597c002b3c49 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1033,6 +1033,8 @@ void __init setup_arch(char **cmdline_p) * * Moreover, on machines with SandyBridge graphics or in setups that use * crashkernel the entire 1M is reserved anyway. + * + * Note the host kernel TDX also requires the first 1MB being reserved. */ x86_platform.realmode_reserve(); diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index ecb0df8ff8b8f..6a3585bc8aa48 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -16,6 +16,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include #include @@ -31,6 +37,9 @@ static DEFINE_PER_CPU(bool, tdx_lp_initialized); static enum tdx_module_status_t tdx_module_status; static DEFINE_MUTEX(tdx_module_lock); +/* All TDX-usable memory regions. Protected by mem_hotplug_lock. */ +static LIST_HEAD(tdx_memlist); + typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args); static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args) @@ -155,12 +164,102 @@ int tdx_cpu_enable(void) } EXPORT_SYMBOL_GPL(tdx_cpu_enable); +/* + * Add a memory region as a TDX memory block. The caller must make sure + * all memory regions are added in address ascending order and don't + * overlap. + */ +static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn, + unsigned long end_pfn) +{ + struct tdx_memblock *tmb; + + tmb = kmalloc(sizeof(*tmb), GFP_KERNEL); + if (!tmb) + return -ENOMEM; + + INIT_LIST_HEAD(&tmb->list); + tmb->start_pfn = start_pfn; + tmb->end_pfn = end_pfn; + + /* @tmb_list is protected by mem_hotplug_lock */ + list_add_tail(&tmb->list, tmb_list); + return 0; +} + +static void free_tdx_memlist(struct list_head *tmb_list) +{ + /* @tmb_list is protected by mem_hotplug_lock */ + while (!list_empty(tmb_list)) { + struct tdx_memblock *tmb = list_first_entry(tmb_list, + struct tdx_memblock, list); + + list_del(&tmb->list); + kfree(tmb); + } +} + +/* + * Ensure that all memblock memory regions are convertible to TDX + * memory. Once this has been established, stash the memblock + * ranges off in a secondary structure because memblock is modified + * in memory hotplug while TDX memory regions are fixed. + */ +static int build_tdx_memlist(struct list_head *tmb_list) +{ + unsigned long start_pfn, end_pfn; + int i, ret; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + /* + * The first 1MB is not reported as TDX convertible memory. + * Although the first 1MB is always reserved and won't end up + * to the page allocator, it is still in memblock's memory + * regions. Skip them manually to exclude them as TDX memory. + */ + start_pfn = max(start_pfn, PHYS_PFN(SZ_1M)); + if (start_pfn >= end_pfn) + continue; + + /* + * Add the memory regions as TDX memory. The regions in + * memblock has already guaranteed they are in address + * ascending order and don't overlap. + */ + ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn); + if (ret) + goto err; + } + + return 0; +err: + free_tdx_memlist(tmb_list); + return ret; +} + static int init_tdx_module(void) { + int ret; + + /* + * To keep things simple, assume that all TDX-protected memory + * will come from the page allocator. Make sure all pages in the + * page allocator are TDX-usable memory. + * + * Build the list of "TDX-usable" memory regions which cover all + * pages in the page allocator to guarantee that. Do it while + * holding mem_hotplug_lock read-lock as the memory hotplug code + * path reads the @tdx_memlist to reject any new memory. + */ + get_online_mems(); + + ret = build_tdx_memlist(&tdx_memlist); + if (ret) + goto out_put_tdxmem; + /* * TODO: * - * - Build the list of TDX-usable memory regions. * - Get TDX module "TD Memory Region" (TDMR) global metadata. * - Construct a list of TDMRs to cover all TDX-usable memory * regions. @@ -170,7 +269,14 @@ static int init_tdx_module(void) * * Return error before all steps are done. */ - return -EINVAL; + ret = -EINVAL; +out_put_tdxmem: + /* + * @tdx_memlist is written here and read at memory hotplug time. + * Lock out memory hotplug code while building it. + */ + put_online_mems(); + return ret; } static int __tdx_enable(void) @@ -257,6 +363,56 @@ static __init int record_keyid_partitioning(u32 *tdx_keyid_start, return 0; } +static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn) +{ + struct tdx_memblock *tmb; + + /* + * This check assumes that the start_pfn<->end_pfn range does not + * cross multiple @tdx_memlist entries. A single memory online + * event across multiple memblocks (from which @tdx_memlist + * entries are derived at the time of module initialization) is + * not possible. This is because memory offline/online is done + * on granularity of 'struct memory_block', and the hotpluggable + * memory region (one memblock) must be multiple of memory_block. + */ + list_for_each_entry(tmb, &tdx_memlist, list) { + if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn) + return true; + } + return false; +} + +static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action, + void *v) +{ + struct memory_notify *mn = v; + + if (action != MEM_GOING_ONLINE) + return NOTIFY_OK; + + /* + * Empty list means TDX isn't enabled. Allow any memory + * to go online. + */ + if (list_empty(&tdx_memlist)) + return NOTIFY_OK; + + /* + * The TDX memory configuration is static and can not be + * changed. Reject onlining any memory which is outside of + * the static configuration whether it supports TDX or not. + */ + if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages)) + return NOTIFY_OK; + + return NOTIFY_BAD; +} + +static struct notifier_block tdx_memory_nb = { + .notifier_call = tdx_memory_notifier, +}; + void __init tdx_init(void) { u32 tdx_keyid_start, nr_tdx_keyids; @@ -280,6 +436,13 @@ void __init tdx_init(void) return; } + err = register_memory_notifier(&tdx_memory_nb); + if (err) { + pr_err("initialization failed: register_memory_notifier() failed (%d)\n", + err); + return; + } + /* * Just use the first TDX KeyID as the 'global KeyID' and * leave the rest for TDX guests. diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index a3c52270df5b7..c11e0a7ca664b 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -27,4 +27,10 @@ enum tdx_module_status_t { TDX_MODULE_ERROR }; +struct tdx_memblock { + struct list_head list; + unsigned long start_pfn; + unsigned long end_pfn; +}; + #endif -- GitLab From cf72bc481634b7c4cd780b6338f222e2892b0232 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:28 -0800 Subject: [PATCH 0228/1290] x86/virt/tdx: Get module global metadata for module initialization The TDX module global metadata provides system-wide information about the module. TL;DR: Use the TDH.SYS.RD SEAMCALL to tell if the module is good or not. Long Version: 1) Only initialize TDX module with version 1.5 and later TDX module 1.0 has some compatibility issues with the later versions of module, as documented in the "Intel TDX module ABI incompatibilities between TDX1.0 and TDX1.5" spec. Don't bother with module versions that do not have a stable ABI. 2) Get the essential global metadata for module initialization TDX reports a list of "Convertible Memory Region" (CMR) to tell the kernel which memory is TDX compatible. The kernel needs to build a list of memory regions (out of CMRs) as "TDX-usable" memory and pass them to the TDX module. The kernel does this by constructing a list of "TD Memory Regions" (TDMRs) to cover all these memory regions and passing them to the TDX module. Each TDMR is a TDX architectural data structure containing the memory region that the TDMR covers, plus the information to track (within this TDMR): a) the "Physical Address Metadata Table" (PAMT) to track each TDX memory page's status (such as which TDX guest "owns" a given page, and b) the "reserved areas" to tell memory holes that cannot be used as TDX memory. The kernel needs to get below metadata from the TDX module to build the list of TDMRs: a) the maximum number of supported TDMRs b) the maximum number of supported reserved areas per TDMR and, c) the PAMT entry size for each TDX-supported page size. == Implementation == The TDX module has two modes of fetching the metadata: a one field at a time, or all in one blob. Use the field at a time for now. It is slower, but there just are not enough fields now to justify the complexity of extra unpacking. The err_free_tdxmem=>out_put_tdxmem goto looks wonky by itself. But it is the first of a bunch of error handling that will get stuck at its site. [ dhansen: clean up changelog and add a struct to map between the TDX module fields and 'struct tdx_tdmr_sysinfo' ] Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-8-dave.hansen%40intel.com --- arch/x86/include/asm/shared/tdx.h | 1 + arch/x86/virt/vmx/tdx/tdx.c | 88 ++++++++++++++++++++++++++++++- arch/x86/virt/vmx/tdx/tdx.h | 39 ++++++++++++++ 3 files changed, 127 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index a4036149c4847..fdfd41511b021 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -59,6 +59,7 @@ #define TDX_PS_4K 0 #define TDX_PS_2M 1 #define TDX_PS_1G 2 +#define TDX_PS_NR (TDX_PS_1G + 1) #ifndef __ASSEMBLY__ diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 6a3585bc8aa48..e76ad7c2e53b2 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -237,8 +237,85 @@ err: return ret; } +static int read_sys_metadata_field(u64 field_id, u64 *data) +{ + struct tdx_module_args args = {}; + int ret; + + /* + * TDH.SYS.RD -- reads one global metadata field + * - RDX (in): the field to read + * - R8 (out): the field data + */ + args.rdx = field_id; + ret = seamcall_prerr_ret(TDH_SYS_RD, &args); + if (ret) + return ret; + + *data = args.r8; + + return 0; +} + +static int read_sys_metadata_field16(u64 field_id, + int offset, + struct tdx_tdmr_sysinfo *ts) +{ + u16 *ts_member = ((void *)ts) + offset; + u64 tmp; + int ret; + + if (WARN_ON_ONCE(MD_FIELD_ID_ELE_SIZE_CODE(field_id) != + MD_FIELD_ID_ELE_SIZE_16BIT)) + return -EINVAL; + + ret = read_sys_metadata_field(field_id, &tmp); + if (ret) + return ret; + + *ts_member = tmp; + + return 0; +} + +struct field_mapping { + u64 field_id; + int offset; +}; + +#define TD_SYSINFO_MAP(_field_id, _offset) \ + { .field_id = MD_FIELD_ID_##_field_id, \ + .offset = offsetof(struct tdx_tdmr_sysinfo, _offset) } + +/* Map TD_SYSINFO fields into 'struct tdx_tdmr_sysinfo': */ +static const struct field_mapping fields[] = { + TD_SYSINFO_MAP(MAX_TDMRS, max_tdmrs), + TD_SYSINFO_MAP(MAX_RESERVED_PER_TDMR, max_reserved_per_tdmr), + TD_SYSINFO_MAP(PAMT_4K_ENTRY_SIZE, pamt_entry_size[TDX_PS_4K]), + TD_SYSINFO_MAP(PAMT_2M_ENTRY_SIZE, pamt_entry_size[TDX_PS_2M]), + TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE, pamt_entry_size[TDX_PS_1G]), +}; + +static int get_tdx_tdmr_sysinfo(struct tdx_tdmr_sysinfo *tdmr_sysinfo) +{ + int ret; + int i; + + /* Populate 'tdmr_sysinfo' fields using the mapping structure above: */ + for (i = 0; i < ARRAY_SIZE(fields); i++) { + ret = read_sys_metadata_field16(fields[i].field_id, + fields[i].offset, + tdmr_sysinfo); + if (ret) + return ret; + } + + return 0; +} + static int init_tdx_module(void) { + struct tdx_tdmr_sysinfo tdmr_sysinfo; int ret; /* @@ -257,10 +334,13 @@ static int init_tdx_module(void) if (ret) goto out_put_tdxmem; + ret = get_tdx_tdmr_sysinfo(&tdmr_sysinfo); + if (ret) + goto err_free_tdxmem; + /* * TODO: * - * - Get TDX module "TD Memory Region" (TDMR) global metadata. * - Construct a list of TDMRs to cover all TDX-usable memory * regions. * - Configure the TDMRs and the global KeyID to the TDX module. @@ -270,6 +350,8 @@ static int init_tdx_module(void) * Return error before all steps are done. */ ret = -EINVAL; + if (ret) + goto err_free_tdxmem; out_put_tdxmem: /* * @tdx_memlist is written here and read at memory hotplug time. @@ -277,6 +359,10 @@ out_put_tdxmem: */ put_online_mems(); return ret; + +err_free_tdxmem: + free_tdx_memlist(&tdx_memlist); + goto out_put_tdxmem; } static int __tdx_enable(void) diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index c11e0a7ca664b..29cdf5ea5544a 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -2,6 +2,8 @@ #ifndef _X86_VIRT_TDX_H #define _X86_VIRT_TDX_H +#include + /* * This file contains both macros and data structures defined by the TDX * architecture and Linux defined software data structures and functions. @@ -13,8 +15,38 @@ * TDX module SEAMCALL leaf functions */ #define TDH_SYS_INIT 33 +#define TDH_SYS_RD 34 #define TDH_SYS_LP_INIT 35 +/* + * Global scope metadata field ID. + * + * See Table "Global Scope Metadata", TDX module 1.5 ABI spec. + */ +#define MD_FIELD_ID_MAX_TDMRS 0x9100000100000008ULL +#define MD_FIELD_ID_MAX_RESERVED_PER_TDMR 0x9100000100000009ULL +#define MD_FIELD_ID_PAMT_4K_ENTRY_SIZE 0x9100000100000010ULL +#define MD_FIELD_ID_PAMT_2M_ENTRY_SIZE 0x9100000100000011ULL +#define MD_FIELD_ID_PAMT_1G_ENTRY_SIZE 0x9100000100000012ULL + +/* + * Sub-field definition of metadata field ID. + * + * See Table "MD_FIELD_ID (Metadata Field Identifier / Sequence Header) + * Definition", TDX module 1.5 ABI spec. + * + * - Bit 33:32: ELEMENT_SIZE_CODE -- size of a single element of metadata + * + * 0: 8 bits + * 1: 16 bits + * 2: 32 bits + * 3: 64 bits + */ +#define MD_FIELD_ID_ELE_SIZE_CODE(_field_id) \ + (((_field_id) & GENMASK_ULL(33, 32)) >> 32) + +#define MD_FIELD_ID_ELE_SIZE_16BIT 1 + /* * Do not put any hardware-defined TDX structure representations below * this comment! @@ -33,4 +65,11 @@ struct tdx_memblock { unsigned long end_pfn; }; +/* "TDMR info" part of "Global Scope Metadata" for constructing TDMRs */ +struct tdx_tdmr_sysinfo { + u16 max_tdmrs; + u16 max_reserved_per_tdmr; + u16 pamt_entry_size[TDX_PS_NR]; +}; + #endif -- GitLab From 5173d3c5d018161aca17d4ac95367cf832c7fff1 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:29 -0800 Subject: [PATCH 0229/1290] x86/virt/tdx: Add placeholder to construct TDMRs to cover all TDX memory regions After the kernel selects all TDX-usable memory regions, the kernel needs to pass those regions to the TDX module via data structure "TD Memory Region" (TDMR). Add a placeholder to construct a list of TDMRs (in multiple steps) to cover all TDX-usable memory regions. === Long Version === TDX provides increased levels of memory confidentiality and integrity. This requires special hardware support for features like memory encryption and storage of memory integrity checksums. Not all memory satisfies these requirements. As a result, TDX introduced the concept of a "Convertible Memory Region" (CMR). During boot, the firmware builds a list of all of the memory ranges which can provide the TDX security guarantees. The list of these ranges is available to the kernel by querying the TDX module. The TDX architecture needs additional metadata to record things like which TD guest "owns" a given page of memory. This metadata essentially serves as the 'struct page' for the TDX module. The space for this metadata is not reserved by the hardware up front and must be allocated by the kernel and given to the TDX module. Since this metadata consumes space, the VMM can choose whether or not to allocate it for a given area of convertible memory. If it chooses not to, the memory cannot receive TDX protections and can not be used by TDX guests as private memory. For every memory region that the VMM wants to use as TDX memory, it sets up a "TD Memory Region" (TDMR). Each TDMR represents a physically contiguous convertible range and must also have its own physically contiguous metadata table, referred to as a Physical Address Metadata Table (PAMT), to track status for each page in the TDMR range. Unlike a CMR, each TDMR requires 1G granularity and alignment. To support physical RAM areas that don't meet those strict requirements, each TDMR permits a number of internal "reserved areas" which can be placed over memory holes. If PAMT metadata is placed within a TDMR it must be covered by one of these reserved areas. Let's summarize the concepts: CMR - Firmware-enumerated physical ranges that support TDX. CMRs are 4K aligned. TDMR - Physical address range which is chosen by the kernel to support TDX. 1G granularity and alignment required. Each TDMR has reserved areas where TDX memory holes and overlapping PAMTs can be represented. PAMT - Physically contiguous TDX metadata. One table for each page size per TDMR. Roughly 1/256th of TDMR in size. 256G TDMR = ~1G PAMT. As one step of initializing the TDX module, the kernel configures TDX-usable memory regions by passing a list of TDMRs to the TDX module. Constructing the list of TDMRs consists below steps: 1) Fill out TDMRs to cover all memory regions that the TDX module will use for TD memory. 2) Allocate and set up PAMT for each TDMR. 3) Designate reserved areas for each TDMR. Add a placeholder to construct TDMRs to do the above steps. To keep things simple, just allocate enough space to hold maximum number of TDMRs up front. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Isaku Yamahata Reviewed-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Link: https://lore.kernel.org/all/20231208170740.53979-9-dave.hansen%40intel.com --- arch/x86/virt/vmx/tdx/tdx.c | 93 +++++++++++++++++++++++++++++++++++-- arch/x86/virt/vmx/tdx/tdx.h | 33 +++++++++++++ 2 files changed, 123 insertions(+), 3 deletions(-) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index e76ad7c2e53b2..eeb7bf8c37427 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,8 @@ static u32 tdx_nr_guest_keyids __ro_after_init; static DEFINE_PER_CPU(bool, tdx_lp_initialized); +static struct tdmr_info_list tdx_tdmr_list; + static enum tdx_module_status_t tdx_module_status; static DEFINE_MUTEX(tdx_module_lock); @@ -313,6 +316,80 @@ static int get_tdx_tdmr_sysinfo(struct tdx_tdmr_sysinfo *tdmr_sysinfo) return 0; } +/* Calculate the actual TDMR size */ +static int tdmr_size_single(u16 max_reserved_per_tdmr) +{ + int tdmr_sz; + + /* + * The actual size of TDMR depends on the maximum + * number of reserved areas. + */ + tdmr_sz = sizeof(struct tdmr_info); + tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr; + + return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT); +} + +static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list, + struct tdx_tdmr_sysinfo *tdmr_sysinfo) +{ + size_t tdmr_sz, tdmr_array_sz; + void *tdmr_array; + + tdmr_sz = tdmr_size_single(tdmr_sysinfo->max_reserved_per_tdmr); + tdmr_array_sz = tdmr_sz * tdmr_sysinfo->max_tdmrs; + + /* + * To keep things simple, allocate all TDMRs together. + * The buffer needs to be physically contiguous to make + * sure each TDMR is physically contiguous. + */ + tdmr_array = alloc_pages_exact(tdmr_array_sz, + GFP_KERNEL | __GFP_ZERO); + if (!tdmr_array) + return -ENOMEM; + + tdmr_list->tdmrs = tdmr_array; + + /* + * Keep the size of TDMR to find the target TDMR + * at a given index in the TDMR list. + */ + tdmr_list->tdmr_sz = tdmr_sz; + tdmr_list->max_tdmrs = tdmr_sysinfo->max_tdmrs; + tdmr_list->nr_consumed_tdmrs = 0; + + return 0; +} + +static void free_tdmr_list(struct tdmr_info_list *tdmr_list) +{ + free_pages_exact(tdmr_list->tdmrs, + tdmr_list->max_tdmrs * tdmr_list->tdmr_sz); +} + +/* + * Construct a list of TDMRs on the preallocated space in @tdmr_list + * to cover all TDX memory regions in @tmb_list based on the TDX module + * TDMR global information in @tdmr_sysinfo. + */ +static int construct_tdmrs(struct list_head *tmb_list, + struct tdmr_info_list *tdmr_list, + struct tdx_tdmr_sysinfo *tdmr_sysinfo) +{ + /* + * TODO: + * + * - Fill out TDMRs to cover all TDX memory regions. + * - Allocate and set up PAMTs for each TDMR. + * - Designate reserved areas for each TDMR. + * + * Return -EINVAL until constructing TDMRs is done + */ + return -EINVAL; +} + static int init_tdx_module(void) { struct tdx_tdmr_sysinfo tdmr_sysinfo; @@ -338,11 +415,19 @@ static int init_tdx_module(void) if (ret) goto err_free_tdxmem; + /* Allocate enough space for constructing TDMRs */ + ret = alloc_tdmr_list(&tdx_tdmr_list, &tdmr_sysinfo); + if (ret) + goto err_free_tdxmem; + + /* Cover all TDX-usable memory regions in TDMRs */ + ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdmr_sysinfo); + if (ret) + goto err_free_tdmrs; + /* * TODO: * - * - Construct a list of TDMRs to cover all TDX-usable memory - * regions. * - Configure the TDMRs and the global KeyID to the TDX module. * - Configure the global KeyID on all packages. * - Initialize all TDMRs. @@ -351,7 +436,7 @@ static int init_tdx_module(void) */ ret = -EINVAL; if (ret) - goto err_free_tdxmem; + goto err_free_tdmrs; out_put_tdxmem: /* * @tdx_memlist is written here and read at memory hotplug time. @@ -360,6 +445,8 @@ out_put_tdxmem: put_online_mems(); return ret; +err_free_tdmrs: + free_tdmr_list(&tdx_tdmr_list); err_free_tdxmem: free_tdx_memlist(&tdx_memlist); goto out_put_tdxmem; diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 29cdf5ea5544a..9b6b5d70804f4 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -47,6 +47,30 @@ #define MD_FIELD_ID_ELE_SIZE_16BIT 1 +struct tdmr_reserved_area { + u64 offset; + u64 size; +} __packed; + +#define TDMR_INFO_ALIGNMENT 512 + +struct tdmr_info { + u64 base; + u64 size; + u64 pamt_1g_base; + u64 pamt_1g_size; + u64 pamt_2m_base; + u64 pamt_2m_size; + u64 pamt_4k_base; + u64 pamt_4k_size; + /* + * The actual number of reserved areas depends on the value of + * field MD_FIELD_ID_MAX_RESERVED_PER_TDMR in the TDX module + * global metadata. + */ + DECLARE_FLEX_ARRAY(struct tdmr_reserved_area, reserved_areas); +} __packed __aligned(TDMR_INFO_ALIGNMENT); + /* * Do not put any hardware-defined TDX structure representations below * this comment! @@ -72,4 +96,13 @@ struct tdx_tdmr_sysinfo { u16 pamt_entry_size[TDX_PS_NR]; }; +struct tdmr_info_list { + void *tdmrs; /* Flexible array to hold 'tdmr_info's */ + int nr_consumed_tdmrs; /* How many 'tdmr_info's are in use */ + + /* Metadata for finding target 'tdmr_info' and freeing @tdmrs */ + int tdmr_sz; /* Size of one 'tdmr_info' */ + int max_tdmrs; /* How many 'tdmr_info's are allocated */ +}; + #endif -- GitLab From f3338ac15931075ef8b81145f5de128158a8dc8e Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:30 -0800 Subject: [PATCH 0230/1290] x86/virt/tdx: Fill out TDMRs to cover all TDX memory regions Start to transit out the "multi-steps" to construct a list of "TD Memory Regions" (TDMRs) to cover all TDX-usable memory regions. The kernel configures TDX-usable memory regions by passing a list of TDMRs "TD Memory Regions" (TDMRs) to the TDX module. Each TDMR contains the information of the base/size of a memory region, the base/size of the associated Physical Address Metadata Table (PAMT) and a list of reserved areas in the region. Do the first step to fill out a number of TDMRs to cover all TDX memory regions. To keep it simple, always try to use one TDMR for each memory region. As the first step only set up the base/size for each TDMR. Each TDMR must be 1G aligned and the size must be in 1G granularity. This implies that one TDMR could cover multiple memory regions. If a memory region spans the 1GB boundary and the former part is already covered by the previous TDMR, just use a new TDMR for the remaining part. TDX only supports a limited number of TDMRs. Disable TDX if all TDMRs are consumed but there is more memory region to cover. There are fancier things that could be done like trying to merge adjacent TDMRs. This would allow more pathological memory layouts to be supported. But, current systems are not even close to exhausting the existing TDMR resources in practice. For now, keep it simple. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Yuan Yao Reviewed-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-10-dave.hansen%40intel.com --- arch/x86/virt/vmx/tdx/tdx.c | 103 +++++++++++++++++++++++++++++++++++- arch/x86/virt/vmx/tdx/tdx.h | 3 ++ 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index eeb7bf8c37427..d7f928713b6c9 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -369,6 +369,102 @@ static void free_tdmr_list(struct tdmr_info_list *tdmr_list) tdmr_list->max_tdmrs * tdmr_list->tdmr_sz); } +/* Get the TDMR from the list at the given index. */ +static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list, + int idx) +{ + int tdmr_info_offset = tdmr_list->tdmr_sz * idx; + + return (void *)tdmr_list->tdmrs + tdmr_info_offset; +} + +#define TDMR_ALIGNMENT SZ_1G +#define TDMR_ALIGN_DOWN(_addr) ALIGN_DOWN((_addr), TDMR_ALIGNMENT) +#define TDMR_ALIGN_UP(_addr) ALIGN((_addr), TDMR_ALIGNMENT) + +static inline u64 tdmr_end(struct tdmr_info *tdmr) +{ + return tdmr->base + tdmr->size; +} + +/* + * Take the memory referenced in @tmb_list and populate the + * preallocated @tdmr_list, following all the special alignment + * and size rules for TDMR. + */ +static int fill_out_tdmrs(struct list_head *tmb_list, + struct tdmr_info_list *tdmr_list) +{ + struct tdx_memblock *tmb; + int tdmr_idx = 0; + + /* + * Loop over TDX memory regions and fill out TDMRs to cover them. + * To keep it simple, always try to use one TDMR to cover one + * memory region. + * + * In practice TDX supports at least 64 TDMRs. A 2-socket system + * typically only consumes less than 10 of those. This code is + * dumb and simple and may use more TMDRs than is strictly + * required. + */ + list_for_each_entry(tmb, tmb_list, list) { + struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx); + u64 start, end; + + start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn)); + end = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn)); + + /* + * A valid size indicates the current TDMR has already + * been filled out to cover the previous memory region(s). + */ + if (tdmr->size) { + /* + * Loop to the next if the current memory region + * has already been fully covered. + */ + if (end <= tdmr_end(tdmr)) + continue; + + /* Otherwise, skip the already covered part. */ + if (start < tdmr_end(tdmr)) + start = tdmr_end(tdmr); + + /* + * Create a new TDMR to cover the current memory + * region, or the remaining part of it. + */ + tdmr_idx++; + if (tdmr_idx >= tdmr_list->max_tdmrs) { + pr_warn("initialization failed: TDMRs exhausted.\n"); + return -ENOSPC; + } + + tdmr = tdmr_entry(tdmr_list, tdmr_idx); + } + + tdmr->base = start; + tdmr->size = end - start; + } + + /* @tdmr_idx is always the index of the last valid TDMR. */ + tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1; + + /* + * Warn early that kernel is about to run out of TDMRs. + * + * This is an indication that TDMR allocation has to be + * reworked to be smarter to not run into an issue. + */ + if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN) + pr_warn("consumed TDMRs reaching limit: %d used out of %d\n", + tdmr_list->nr_consumed_tdmrs, + tdmr_list->max_tdmrs); + + return 0; +} + /* * Construct a list of TDMRs on the preallocated space in @tdmr_list * to cover all TDX memory regions in @tmb_list based on the TDX module @@ -378,10 +474,15 @@ static int construct_tdmrs(struct list_head *tmb_list, struct tdmr_info_list *tdmr_list, struct tdx_tdmr_sysinfo *tdmr_sysinfo) { + int ret; + + ret = fill_out_tdmrs(tmb_list, tdmr_list); + if (ret) + return ret; + /* * TODO: * - * - Fill out TDMRs to cover all TDX memory regions. * - Allocate and set up PAMTs for each TDMR. * - Designate reserved areas for each TDMR. * diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 9b6b5d70804f4..f18ce1b88b0af 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -96,6 +96,9 @@ struct tdx_tdmr_sysinfo { u16 pamt_entry_size[TDX_PS_NR]; }; +/* Warn if kernel has less than TDMR_NR_WARN TDMRs after allocation */ +#define TDMR_NR_WARN 4 + struct tdmr_info_list { void *tdmrs; /* Flexible array to hold 'tdmr_info's */ int nr_consumed_tdmrs; /* How many 'tdmr_info's are in use */ -- GitLab From ac3a22088434e31dddda82aa473bee008df99b97 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:31 -0800 Subject: [PATCH 0231/1290] x86/virt/tdx: Allocate and set up PAMTs for TDMRs The TDX module uses additional metadata to record things like which guest "owns" a given page of memory. This metadata, referred as Physical Address Metadata Table (PAMT), essentially serves as the 'struct page' for the TDX module. PAMTs are not reserved by hardware up front. They must be allocated by the kernel and then given to the TDX module during module initialization. TDX supports 3 page sizes: 4K, 2M, and 1G. Each "TD Memory Region" (TDMR) has 3 PAMTs to track the 3 supported page sizes. Each PAMT must be a physically contiguous area from a Convertible Memory Region (CMR). However, the PAMTs which track pages in one TDMR do not need to reside within that TDMR but can be anywhere in CMRs. If one PAMT overlaps with any TDMR, the overlapping part must be reported as a reserved area in that particular TDMR. Use alloc_contig_pages() since PAMT must be a physically contiguous area and it may be potentially large (~1/256th of the size of the given TDMR). The downside is alloc_contig_pages() may fail at runtime. One (bad) mitigation is to launch a TDX guest early during system boot to get those PAMTs allocated at early time, but the only way to fix is to add a boot option to allocate or reserve PAMTs during kernel boot. It is imperfect but will be improved on later. TDX only supports a limited number of reserved areas per TDMR to cover both PAMTs and memory holes within the given TDMR. If many PAMTs are allocated within a single TDMR, the reserved areas may not be sufficient to cover all of them. Adopt the following policies when allocating PAMTs for a given TDMR: - Allocate three PAMTs of the TDMR in one contiguous chunk to minimize the total number of reserved areas consumed for PAMTs. - Try to first allocate PAMT from the local node of the TDMR for better NUMA locality. Also dump out how many pages are allocated for PAMTs when the TDX module is initialized successfully. This helps answer the eternal "where did all my memory go?" questions. [ dhansen: merge in error handling cleanup ] Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Isaku Yamahata Reviewed-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Reviewed-by: Yuan Yao Link: https://lore.kernel.org/all/20231208170740.53979-11-dave.hansen%40intel.com --- arch/x86/Kconfig | 1 + arch/x86/virt/vmx/tdx/tdx.c | 217 +++++++++++++++++++++++++++++++++++- arch/x86/virt/vmx/tdx/tdx.h | 1 + 3 files changed, 213 insertions(+), 6 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2c69ef8448059..e255d8ae5e77e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1972,6 +1972,7 @@ config INTEL_TDX_HOST depends on KVM_INTEL depends on X86_X2APIC select ARCH_KEEP_MEMBLOCK + depends on CONTIG_ALLOC help Intel Trust Domain Extensions (TDX) protects guest VMs from malicious host and certain physical attacks. This option enables necessary TDX diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index d7f928713b6c9..c2bff81245986 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -173,7 +173,7 @@ EXPORT_SYMBOL_GPL(tdx_cpu_enable); * overlap. */ static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn, - unsigned long end_pfn) + unsigned long end_pfn, int nid) { struct tdx_memblock *tmb; @@ -184,6 +184,7 @@ static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn, INIT_LIST_HEAD(&tmb->list); tmb->start_pfn = start_pfn; tmb->end_pfn = end_pfn; + tmb->nid = nid; /* @tmb_list is protected by mem_hotplug_lock */ list_add_tail(&tmb->list, tmb_list); @@ -211,9 +212,9 @@ static void free_tdx_memlist(struct list_head *tmb_list) static int build_tdx_memlist(struct list_head *tmb_list) { unsigned long start_pfn, end_pfn; - int i, ret; + int i, nid, ret; - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { /* * The first 1MB is not reported as TDX convertible memory. * Although the first 1MB is always reserved and won't end up @@ -229,7 +230,7 @@ static int build_tdx_memlist(struct list_head *tmb_list) * memblock has already guaranteed they are in address * ascending order and don't overlap. */ - ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn); + ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid); if (ret) goto err; } @@ -465,6 +466,202 @@ static int fill_out_tdmrs(struct list_head *tmb_list, return 0; } +/* + * Calculate PAMT size given a TDMR and a page size. The returned + * PAMT size is always aligned up to 4K page boundary. + */ +static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz, + u16 pamt_entry_size) +{ + unsigned long pamt_sz, nr_pamt_entries; + + switch (pgsz) { + case TDX_PS_4K: + nr_pamt_entries = tdmr->size >> PAGE_SHIFT; + break; + case TDX_PS_2M: + nr_pamt_entries = tdmr->size >> PMD_SHIFT; + break; + case TDX_PS_1G: + nr_pamt_entries = tdmr->size >> PUD_SHIFT; + break; + default: + WARN_ON_ONCE(1); + return 0; + } + + pamt_sz = nr_pamt_entries * pamt_entry_size; + /* TDX requires PAMT size must be 4K aligned */ + pamt_sz = ALIGN(pamt_sz, PAGE_SIZE); + + return pamt_sz; +} + +/* + * Locate a NUMA node which should hold the allocation of the @tdmr + * PAMT. This node will have some memory covered by the TDMR. The + * relative amount of memory covered is not considered. + */ +static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list) +{ + struct tdx_memblock *tmb; + + /* + * A TDMR must cover at least part of one TMB. That TMB will end + * after the TDMR begins. But, that TMB may have started before + * the TDMR. Find the next 'tmb' that _ends_ after this TDMR + * begins. Ignore 'tmb' start addresses. They are irrelevant. + */ + list_for_each_entry(tmb, tmb_list, list) { + if (tmb->end_pfn > PHYS_PFN(tdmr->base)) + return tmb->nid; + } + + /* + * Fall back to allocating the TDMR's metadata from node 0 when + * no TDX memory block can be found. This should never happen + * since TDMRs originate from TDX memory blocks. + */ + pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n", + tdmr->base, tdmr_end(tdmr)); + return 0; +} + +/* + * Allocate PAMTs from the local NUMA node of some memory in @tmb_list + * within @tdmr, and set up PAMTs for @tdmr. + */ +static int tdmr_set_up_pamt(struct tdmr_info *tdmr, + struct list_head *tmb_list, + u16 pamt_entry_size[]) +{ + unsigned long pamt_base[TDX_PS_NR]; + unsigned long pamt_size[TDX_PS_NR]; + unsigned long tdmr_pamt_base; + unsigned long tdmr_pamt_size; + struct page *pamt; + int pgsz, nid; + + nid = tdmr_get_nid(tdmr, tmb_list); + + /* + * Calculate the PAMT size for each TDX supported page size + * and the total PAMT size. + */ + tdmr_pamt_size = 0; + for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) { + pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz, + pamt_entry_size[pgsz]); + tdmr_pamt_size += pamt_size[pgsz]; + } + + /* + * Allocate one chunk of physically contiguous memory for all + * PAMTs. This helps minimize the PAMT's use of reserved areas + * in overlapped TDMRs. + */ + pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL, + nid, &node_online_map); + if (!pamt) + return -ENOMEM; + + /* + * Break the contiguous allocation back up into the + * individual PAMTs for each page size. + */ + tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT; + for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) { + pamt_base[pgsz] = tdmr_pamt_base; + tdmr_pamt_base += pamt_size[pgsz]; + } + + tdmr->pamt_4k_base = pamt_base[TDX_PS_4K]; + tdmr->pamt_4k_size = pamt_size[TDX_PS_4K]; + tdmr->pamt_2m_base = pamt_base[TDX_PS_2M]; + tdmr->pamt_2m_size = pamt_size[TDX_PS_2M]; + tdmr->pamt_1g_base = pamt_base[TDX_PS_1G]; + tdmr->pamt_1g_size = pamt_size[TDX_PS_1G]; + + return 0; +} + +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base, + unsigned long *pamt_size) +{ + unsigned long pamt_bs, pamt_sz; + + /* + * The PAMT was allocated in one contiguous unit. The 4K PAMT + * should always point to the beginning of that allocation. + */ + pamt_bs = tdmr->pamt_4k_base; + pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size; + + WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK)); + + *pamt_base = pamt_bs; + *pamt_size = pamt_sz; +} + +static void tdmr_free_pamt(struct tdmr_info *tdmr) +{ + unsigned long pamt_base, pamt_size; + + tdmr_get_pamt(tdmr, &pamt_base, &pamt_size); + + /* Do nothing if PAMT hasn't been allocated for this TDMR */ + if (!pamt_size) + return; + + if (WARN_ON_ONCE(!pamt_base)) + return; + + free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT); +} + +static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list) +{ + int i; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) + tdmr_free_pamt(tdmr_entry(tdmr_list, i)); +} + +/* Allocate and set up PAMTs for all TDMRs */ +static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list, + struct list_head *tmb_list, + u16 pamt_entry_size[]) +{ + int i, ret = 0; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { + ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list, + pamt_entry_size); + if (ret) + goto err; + } + + return 0; +err: + tdmrs_free_pamt_all(tdmr_list); + return ret; +} + +static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) +{ + unsigned long pamt_size = 0; + int i; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { + unsigned long base, size; + + tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size); + pamt_size += size; + } + + return pamt_size / 1024; +} + /* * Construct a list of TDMRs on the preallocated space in @tdmr_list * to cover all TDX memory regions in @tmb_list based on the TDX module @@ -480,10 +677,13 @@ static int construct_tdmrs(struct list_head *tmb_list, if (ret) return ret; + ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, + tdmr_sysinfo->pamt_entry_size); + if (ret) + return ret; /* * TODO: * - * - Allocate and set up PAMTs for each TDMR. * - Designate reserved areas for each TDMR. * * Return -EINVAL until constructing TDMRs is done @@ -537,7 +737,10 @@ static int init_tdx_module(void) */ ret = -EINVAL; if (ret) - goto err_free_tdmrs; + goto err_free_pamts; + + pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list)); + out_put_tdxmem: /* * @tdx_memlist is written here and read at memory hotplug time. @@ -546,6 +749,8 @@ out_put_tdxmem: put_online_mems(); return ret; +err_free_pamts: + tdmrs_free_pamt_all(&tdx_tdmr_list); err_free_tdmrs: free_tdmr_list(&tdx_tdmr_list); err_free_tdxmem: diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index f18ce1b88b0af..1b04efece9dbd 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -87,6 +87,7 @@ struct tdx_memblock { struct list_head list; unsigned long start_pfn; unsigned long end_pfn; + int nid; }; /* "TDMR info" part of "Global Scope Metadata" for constructing TDMRs */ -- GitLab From dde3b60d572c941e2c72d22c18a08586fb67bb50 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:32 -0800 Subject: [PATCH 0232/1290] x86/virt/tdx: Designate reserved areas for all TDMRs As the last step of constructing TDMRs, populate reserved areas for all TDMRs. Cover all memory holes and PAMTs with a TMDR reserved area. [ dhansen: trim down chagnelog ] Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Isaku Yamahata Reviewed-by: Kirill A. Shutemov Reviewed-by: Yuan Yao Reviewed-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-12-dave.hansen%40intel.com --- arch/x86/virt/vmx/tdx/tdx.c | 217 ++++++++++++++++++++++++++++++++++-- 1 file changed, 209 insertions(+), 8 deletions(-) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index c2bff81245986..98283f728a816 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -662,6 +663,207 @@ static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) return pamt_size / 1024; } +static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr, + u64 size, u16 max_reserved_per_tdmr) +{ + struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas; + int idx = *p_idx; + + /* Reserved area must be 4K aligned in offset and size */ + if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK)) + return -EINVAL; + + if (idx >= max_reserved_per_tdmr) { + pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n", + tdmr->base, tdmr_end(tdmr)); + return -ENOSPC; + } + + /* + * Consume one reserved area per call. Make no effort to + * optimize or reduce the number of reserved areas which are + * consumed by contiguous reserved areas, for instance. + */ + rsvd_areas[idx].offset = addr - tdmr->base; + rsvd_areas[idx].size = size; + + *p_idx = idx + 1; + + return 0; +} + +/* + * Go through @tmb_list to find holes between memory areas. If any of + * those holes fall within @tdmr, set up a TDMR reserved area to cover + * the hole. + */ +static int tdmr_populate_rsvd_holes(struct list_head *tmb_list, + struct tdmr_info *tdmr, + int *rsvd_idx, + u16 max_reserved_per_tdmr) +{ + struct tdx_memblock *tmb; + u64 prev_end; + int ret; + + /* + * Start looking for reserved blocks at the + * beginning of the TDMR. + */ + prev_end = tdmr->base; + list_for_each_entry(tmb, tmb_list, list) { + u64 start, end; + + start = PFN_PHYS(tmb->start_pfn); + end = PFN_PHYS(tmb->end_pfn); + + /* Break if this region is after the TDMR */ + if (start >= tdmr_end(tdmr)) + break; + + /* Exclude regions before this TDMR */ + if (end < tdmr->base) + continue; + + /* + * Skip over memory areas that + * have already been dealt with. + */ + if (start <= prev_end) { + prev_end = end; + continue; + } + + /* Add the hole before this region */ + ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end, + start - prev_end, + max_reserved_per_tdmr); + if (ret) + return ret; + + prev_end = end; + } + + /* Add the hole after the last region if it exists. */ + if (prev_end < tdmr_end(tdmr)) { + ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end, + tdmr_end(tdmr) - prev_end, + max_reserved_per_tdmr); + if (ret) + return ret; + } + + return 0; +} + +/* + * Go through @tdmr_list to find all PAMTs. If any of those PAMTs + * overlaps with @tdmr, set up a TDMR reserved area to cover the + * overlapping part. + */ +static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list, + struct tdmr_info *tdmr, + int *rsvd_idx, + u16 max_reserved_per_tdmr) +{ + int i, ret; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { + struct tdmr_info *tmp = tdmr_entry(tdmr_list, i); + unsigned long pamt_base, pamt_size, pamt_end; + + tdmr_get_pamt(tmp, &pamt_base, &pamt_size); + /* Each TDMR must already have PAMT allocated */ + WARN_ON_ONCE(!pamt_size || !pamt_base); + + pamt_end = pamt_base + pamt_size; + /* Skip PAMTs outside of the given TDMR */ + if ((pamt_end <= tdmr->base) || + (pamt_base >= tdmr_end(tdmr))) + continue; + + /* Only mark the part within the TDMR as reserved */ + if (pamt_base < tdmr->base) + pamt_base = tdmr->base; + if (pamt_end > tdmr_end(tdmr)) + pamt_end = tdmr_end(tdmr); + + ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base, + pamt_end - pamt_base, + max_reserved_per_tdmr); + if (ret) + return ret; + } + + return 0; +} + +/* Compare function called by sort() for TDMR reserved areas */ +static int rsvd_area_cmp_func(const void *a, const void *b) +{ + struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a; + struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b; + + if (r1->offset + r1->size <= r2->offset) + return -1; + if (r1->offset >= r2->offset + r2->size) + return 1; + + /* Reserved areas cannot overlap. The caller must guarantee. */ + WARN_ON_ONCE(1); + return -1; +} + +/* + * Populate reserved areas for the given @tdmr, including memory holes + * (via @tmb_list) and PAMTs (via @tdmr_list). + */ +static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr, + struct list_head *tmb_list, + struct tdmr_info_list *tdmr_list, + u16 max_reserved_per_tdmr) +{ + int ret, rsvd_idx = 0; + + ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx, + max_reserved_per_tdmr); + if (ret) + return ret; + + ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx, + max_reserved_per_tdmr); + if (ret) + return ret; + + /* TDX requires reserved areas listed in address ascending order */ + sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area), + rsvd_area_cmp_func, NULL); + + return 0; +} + +/* + * Populate reserved areas for all TDMRs in @tdmr_list, including memory + * holes (via @tmb_list) and PAMTs. + */ +static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list, + struct list_head *tmb_list, + u16 max_reserved_per_tdmr) +{ + int i; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { + int ret; + + ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i), + tmb_list, tdmr_list, max_reserved_per_tdmr); + if (ret) + return ret; + } + + return 0; +} + /* * Construct a list of TDMRs on the preallocated space in @tdmr_list * to cover all TDX memory regions in @tmb_list based on the TDX module @@ -681,14 +883,13 @@ static int construct_tdmrs(struct list_head *tmb_list, tdmr_sysinfo->pamt_entry_size); if (ret) return ret; - /* - * TODO: - * - * - Designate reserved areas for each TDMR. - * - * Return -EINVAL until constructing TDMRs is done - */ - return -EINVAL; + + ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list, + tdmr_sysinfo->max_reserved_per_tdmr); + if (ret) + tdmrs_free_pamt_all(tdmr_list); + + return ret; } static int init_tdx_module(void) -- GitLab From 554ce1c36d1bbeaec42ba870bad75e200af204be Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:33 -0800 Subject: [PATCH 0233/1290] x86/virt/tdx: Configure TDX module with the TDMRs and global KeyID The TDX module uses a private KeyID as the "global KeyID" for mapping things like the PAMT and other TDX metadata. This KeyID has already been reserved when detecting TDX during the kernel early boot. Now that the "TD Memory Regions" (TDMRs) are fully built, pass them to the TDX module together with the global KeyID. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Isaku Yamahata Reviewed-by: Kirill A. Shutemov Reviewed-by: Yuan Yao Reviewed-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-13-dave.hansen%40intel.com --- arch/x86/virt/vmx/tdx/tdx.c | 43 ++++++++++++++++++++++++++++++++++++- arch/x86/virt/vmx/tdx/tdx.h | 2 ++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 98283f728a816..5146503233c07 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include #include @@ -892,6 +894,41 @@ static int construct_tdmrs(struct list_head *tmb_list, return ret; } +static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid) +{ + struct tdx_module_args args = {}; + u64 *tdmr_pa_array; + size_t array_sz; + int i, ret; + + /* + * TDMRs are passed to the TDX module via an array of physical + * addresses of each TDMR. The array itself also has certain + * alignment requirement. + */ + array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64); + array_sz = roundup_pow_of_two(array_sz); + if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT) + array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT; + + tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL); + if (!tdmr_pa_array) + return -ENOMEM; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) + tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i)); + + args.rcx = __pa(tdmr_pa_array); + args.rdx = tdmr_list->nr_consumed_tdmrs; + args.r8 = global_keyid; + ret = seamcall_prerr(TDH_SYS_CONFIG, &args); + + /* Free the array as it is not required anymore. */ + kfree(tdmr_pa_array); + + return ret; +} + static int init_tdx_module(void) { struct tdx_tdmr_sysinfo tdmr_sysinfo; @@ -927,10 +964,14 @@ static int init_tdx_module(void) if (ret) goto err_free_tdmrs; + /* Pass the TDMRs and the global KeyID to the TDX module */ + ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid); + if (ret) + goto err_free_pamts; + /* * TODO: * - * - Configure the TDMRs and the global KeyID to the TDX module. * - Configure the global KeyID on all packages. * - Initialize all TDMRs. * diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 1b04efece9dbd..fa5bcf8b5a9ca 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -17,6 +17,7 @@ #define TDH_SYS_INIT 33 #define TDH_SYS_RD 34 #define TDH_SYS_LP_INIT 35 +#define TDH_SYS_CONFIG 45 /* * Global scope metadata field ID. @@ -53,6 +54,7 @@ struct tdmr_reserved_area { } __packed; #define TDMR_INFO_ALIGNMENT 512 +#define TDMR_INFO_PA_ARRAY_ALIGNMENT 512 struct tdmr_info { u64 base; -- GitLab From e56d28df2f666af9ce0c27d8d204ad813d3766f4 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:34 -0800 Subject: [PATCH 0234/1290] x86/virt/tdx: Configure global KeyID on all packages After the list of TDMRs and the global KeyID are configured to the TDX module, the kernel needs to configure the key of the global KeyID on all packages using TDH.SYS.KEY.CONFIG. This SEAMCALL cannot run parallel on different cpus. Loop all online cpus and use smp_call_on_cpu() to call this SEAMCALL on the first cpu of each package. To keep things simple, this implementation takes no affirmative steps to online cpus to make sure there's at least one cpu for each package. The callers (aka. KVM) can ensure success by ensuring sufficient CPUs are online for this to succeed. Intel hardware doesn't guarantee cache coherency across different KeyIDs. The PAMTs are transitioning from being used by the kernel mapping (KeyId 0) to the TDX module's "global KeyID" mapping. This means that the kernel must flush any dirty KeyID-0 PAMT cachelines before the TDX module uses the global KeyID to access the PAMTs. Otherwise, if those dirty cachelines were written back, they would corrupt the TDX module's metadata. Aside: This corruption would be detected by the memory integrity hardware on the next read of the memory with the global KeyID. The result would likely be fatal to the system but would not impact TDX security. Following the TDX module specification, flush cache before configuring the global KeyID on all packages. Given the PAMT size can be large (~1/256th of system RAM), just use WBINVD on all CPUs to flush. If TDH.SYS.KEY.CONFIG fails, the TDX module may already have "converted" some memory for TDX module use. Convert the memory back so that it can be safely used by the kernel again. Note that this is slower than it should be because of the "partial write machine check" erratum which affects TDX-capable hardware. Also refactor and introduce a new helper: tdmr_do_pamt_func(). This takes a TDMR and runs a function on its PAMT. It looks a _bit_ odd to pass a function pointer around like this, but its use is pretty narrow and it does eliminate what would otherwise be some copying and pasting. [ dhansen: * munge changelog as usual * remove weird (*pamd_func)() syntax ] Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Isaku Yamahata Reviewed-by: Kirill A. Shutemov Reviewed-by: Yuan Yao Reviewed-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-14-dave.hansen%40intel.com --- arch/x86/virt/vmx/tdx/tdx.c | 133 +++++++++++++++++++++++++++++++++++- arch/x86/virt/vmx/tdx/tdx.h | 1 + 2 files changed, 132 insertions(+), 2 deletions(-) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 5146503233c07..d2e4180b8bd0f 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -606,7 +607,8 @@ static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base, *pamt_size = pamt_sz; } -static void tdmr_free_pamt(struct tdmr_info *tdmr) +static void tdmr_do_pamt_func(struct tdmr_info *tdmr, + void (*pamt_func)(unsigned long base, unsigned long size)) { unsigned long pamt_base, pamt_size; @@ -619,9 +621,19 @@ static void tdmr_free_pamt(struct tdmr_info *tdmr) if (WARN_ON_ONCE(!pamt_base)) return; + pamt_func(pamt_base, pamt_size); +} + +static void free_pamt(unsigned long pamt_base, unsigned long pamt_size) +{ free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT); } +static void tdmr_free_pamt(struct tdmr_info *tdmr) +{ + tdmr_do_pamt_func(tdmr, free_pamt); +} + static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list) { int i; @@ -650,6 +662,41 @@ err: return ret; } +/* + * Convert TDX private pages back to normal by using MOVDIR64B to + * clear these pages. Note this function doesn't flush cache of + * these TDX private pages. The caller should make sure of that. + */ +static void reset_tdx_pages(unsigned long base, unsigned long size) +{ + const void *zero_page = (const void *)page_address(ZERO_PAGE(0)); + unsigned long phys, end; + + end = base + size; + for (phys = base; phys < end; phys += 64) + movdir64b(__va(phys), zero_page); + + /* + * MOVDIR64B uses WC protocol. Use memory barrier to + * make sure any later user of these pages sees the + * updated data. + */ + mb(); +} + +static void tdmr_reset_pamt(struct tdmr_info *tdmr) +{ + tdmr_do_pamt_func(tdmr, reset_tdx_pages); +} + +static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list) +{ + int i; + + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) + tdmr_reset_pamt(tdmr_entry(tdmr_list, i)); +} + static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) { unsigned long pamt_size = 0; @@ -929,6 +976,64 @@ static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid) return ret; } +static int do_global_key_config(void *unused) +{ + struct tdx_module_args args = {}; + + return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args); +} + +/* + * Attempt to configure the global KeyID on all physical packages. + * + * This requires running code on at least one CPU in each package. + * TDMR initialization) will fail will fail if any package in the + * system has no online CPUs. + * + * This code takes no affirmative steps to online CPUs. Callers (aka. + * KVM) can ensure success by ensuring sufficient CPUs are online and + * can run SEAMCALLs. + */ +static int config_global_keyid(void) +{ + cpumask_var_t packages; + int cpu, ret = -EINVAL; + + if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) + return -ENOMEM; + + /* + * Hardware doesn't guarantee cache coherency across different + * KeyIDs. The kernel needs to flush PAMT's dirty cachelines + * (associated with KeyID 0) before the TDX module can use the + * global KeyID to access the PAMT. Given PAMTs are potentially + * large (~1/256th of system RAM), just use WBINVD. + */ + wbinvd_on_all_cpus(); + + for_each_online_cpu(cpu) { + /* + * The key configuration only needs to be done once per + * package and will return an error if configured more + * than once. Avoid doing it multiple times per package. + */ + if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu), + packages)) + continue; + + /* + * TDH.SYS.KEY.CONFIG cannot run concurrently on + * different cpus. Do it one by one. + */ + ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true); + if (ret) + break; + } + + free_cpumask_var(packages); + return ret; +} + static int init_tdx_module(void) { struct tdx_tdmr_sysinfo tdmr_sysinfo; @@ -969,6 +1074,11 @@ static int init_tdx_module(void) if (ret) goto err_free_pamts; + /* Config the key of global KeyID on all packages */ + ret = config_global_keyid(); + if (ret) + goto err_reset_pamts; + /* * TODO: * @@ -979,7 +1089,7 @@ static int init_tdx_module(void) */ ret = -EINVAL; if (ret) - goto err_free_pamts; + goto err_reset_pamts; pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list)); @@ -991,6 +1101,22 @@ out_put_tdxmem: put_online_mems(); return ret; +err_reset_pamts: + /* + * Part of PAMTs may already have been initialized by the + * TDX module. Flush cache before returning PAMTs back + * to the kernel. + */ + wbinvd_on_all_cpus(); + /* + * According to the TDX hardware spec, if the platform + * doesn't have the "partial write machine check" + * erratum, any kernel read/write will never cause #MC + * in kernel space, thus it's OK to not convert PAMTs + * back to normal. But do the conversion anyway here + * as suggested by the TDX spec. + */ + tdmrs_reset_pamt_all(&tdx_tdmr_list); err_free_pamts: tdmrs_free_pamt_all(&tdx_tdmr_list); err_free_tdmrs: @@ -1024,6 +1150,9 @@ static int __tdx_enable(void) * lock to prevent any new cpu from becoming online; 2) done both VMXON * and tdx_cpu_enable() on all online cpus. * + * This function requires there's at least one online cpu for each CPU + * package to succeed. + * * This function can be called in parallel by multiple callers. * * Return 0 if TDX is enabled successfully, otherwise error. diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index fa5bcf8b5a9ca..dd35baf756b86 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -14,6 +14,7 @@ /* * TDX module SEAMCALL leaf functions */ +#define TDH_SYS_KEY_CONFIG 31 #define TDH_SYS_INIT 33 #define TDH_SYS_RD 34 #define TDH_SYS_LP_INIT 35 -- GitLab From 0b2bc38131f02d6fd38695f191bbd8c6109ecffc Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:35 -0800 Subject: [PATCH 0235/1290] x86/virt/tdx: Initialize all TDMRs After the global KeyID has been configured on all packages, initialize all TDMRs to make all TDX-usable memory regions that are passed to the TDX module become usable. This is the last step of initializing the TDX module. Initializing TDMRs can be time consuming on large memory systems as it involves initializing all metadata entries for all pages that can be used by TDX guests. Initializing different TDMRs can be parallelized. For now to keep it simple, just initialize all TDMRs one by one. It can be enhanced in the future. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Isaku Yamahata Reviewed-by: Kirill A. Shutemov Reviewed-by: Yuan Yao Reviewed-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-15-dave.hansen%40intel.com --- arch/x86/virt/vmx/tdx/tdx.c | 61 +++++++++++++++++++++++++++++++------ arch/x86/virt/vmx/tdx/tdx.h | 1 + 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index d2e4180b8bd0f..48fb1b3101c04 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1034,6 +1034,56 @@ static int config_global_keyid(void) return ret; } +static int init_tdmr(struct tdmr_info *tdmr) +{ + u64 next; + + /* + * Initializing a TDMR can be time consuming. To avoid long + * SEAMCALLs, the TDX module may only initialize a part of the + * TDMR in each call. + */ + do { + struct tdx_module_args args = { + .rcx = tdmr->base, + }; + int ret; + + ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args); + if (ret) + return ret; + /* + * RDX contains 'next-to-initialize' address if + * TDH.SYS.TDMR.INIT did not fully complete and + * should be retried. + */ + next = args.rdx; + cond_resched(); + /* Keep making SEAMCALLs until the TDMR is done */ + } while (next < tdmr->base + tdmr->size); + + return 0; +} + +static int init_tdmrs(struct tdmr_info_list *tdmr_list) +{ + int i; + + /* + * This operation is costly. It can be parallelized, + * but keep it simple for now. + */ + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { + int ret; + + ret = init_tdmr(tdmr_entry(tdmr_list, i)); + if (ret) + return ret; + } + + return 0; +} + static int init_tdx_module(void) { struct tdx_tdmr_sysinfo tdmr_sysinfo; @@ -1079,15 +1129,8 @@ static int init_tdx_module(void) if (ret) goto err_reset_pamts; - /* - * TODO: - * - * - Configure the global KeyID on all packages. - * - Initialize all TDMRs. - * - * Return error before all steps are done. - */ - ret = -EINVAL; + /* Initialize TDMRs to complete the TDX module initialization */ + ret = init_tdmrs(&tdx_tdmr_list); if (ret) goto err_reset_pamts; diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index dd35baf756b86..c0610f0bb88ce 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -18,6 +18,7 @@ #define TDH_SYS_INIT 33 #define TDH_SYS_RD 34 #define TDH_SYS_LP_INIT 35 +#define TDH_SYS_TDMR_INIT 36 #define TDH_SYS_CONFIG 45 /* -- GitLab From f3f6aa68640298fb966811b991c7b8efee67e181 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:36 -0800 Subject: [PATCH 0236/1290] x86/virt/tdx: Handle TDX interaction with sleep and hibernation TDX is incompatible with hibernation and some ACPI sleep states. Users must disable hibernation to use TDX. Users must also disable TDX if they want to use ACPI S3 sleep. This feels a bit wonky and asymmetric, but it avoids adding any new command-line parameters for now. It can be improved if users hate it too much. Long version: TDX cannot survive from S3 and deeper states. The hardware resets and disables TDX completely when platform goes to S3 and deeper. Both TDX guests and the TDX module get destroyed permanently. The kernel uses S3 to support suspend-to-ram, and S4 or deeper states to support hibernation. The kernel also maintains TDX states to track whether it has been initialized and its metadata resource, etc. After resuming from S3 or hibernation, these TDX states won't be correct anymore. Theoretically, the kernel can do more complicated things like resetting TDX internal states and TDX module metadata before going to S3 or deeper, and re-initialize TDX module after resuming, etc, but there is no way to save/restore TDX guests for now. Until TDX supports full save and restore of TDX guests, there is no big value to handle TDX module in suspend and hibernation alone. To make things simple, just choose to make TDX mutually exclusive with S3 and hibernation. Note the TDX module is initialized at runtime. To avoid having to deal with the fuss of determining TDX state at runtime, just choose TDX vs S3 and hibernation at kernel early boot. It's a bad user experience if the choice of TDX and S3/hibernation is done at runtime anyway, i.e., the user can experience being able to do S3/hibernation but later becoming unable to due to TDX being enabled. Disable TDX in kernel early boot when hibernation support is available. Currently there's no mechanism exposed by the hibernation code to allow other kernel code to disable hibernation once for all. Users that want TDX must disable hibernation, like using hibername=no on the command line. Disable ACPI S3 when TDX is enabled by the BIOS. For now the user needs to disable TDX in the BIOS to use ACPI S3. A new kernel command line can be added in the future if there's a need to let user disable TDX host via kernel command line. Alternatively, the kernel could disable TDX when ACPI S3 is supported and request the user to disable S3 to use TDX. But there's no existing kernel command line to do that, and BIOS doesn't always have an option to disable S3. [ dhansen: subject / changelog tweaks ] Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Reviewed-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-16-dave.hansen%40intel.com --- arch/x86/virt/vmx/tdx/tdx.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 48fb1b3101c04..6d030f64720ba 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -1329,6 +1331,15 @@ void __init tdx_init(void) return; } + /* + * At this point, hibernation_available() indicates whether or + * not hibernation support has been permanently disabled. + */ + if (hibernation_available()) { + pr_err("initialization failed: Hibernation support is enabled\n"); + return; + } + err = register_memory_notifier(&tdx_memory_nb); if (err) { pr_err("initialization failed: register_memory_notifier() failed (%d)\n", @@ -1336,6 +1347,11 @@ void __init tdx_init(void) return; } +#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) + pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n"); + acpi_suspend_lowlevel = NULL; +#endif + /* * Just use the first TDX KeyID as the 'global KeyID' and * leave the rest for TDX guests. -- GitLab From 79eba8c924f7decfa71ddf187d38cb9f5f2cd7b3 Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:42 +0200 Subject: [PATCH 0237/1290] selftests/sgx: Fix uninitialized pointer dereference in error path Ensure ctx is zero-initialized, such that the encl_measure function will not call EVP_MD_CTX_destroy with an uninitialized ctx pointer in case of an early error during key generation. Fixes: 2adcba79e69d ("selftests/x86: Add a selftest for SGX") Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Kai Huang Link: https://lore.kernel.org/all/20231005153854.25566-2-jo.vanbulck%40cs.kuleuven.be --- tools/testing/selftests/sgx/sigstruct.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c index a07896a463643..d73b29becf5b0 100644 --- a/tools/testing/selftests/sgx/sigstruct.c +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -318,9 +318,9 @@ bool encl_measure(struct encl *encl) struct sgx_sigstruct *sigstruct = &encl->sigstruct; struct sgx_sigstruct_payload payload; uint8_t digest[SHA256_DIGEST_LENGTH]; + EVP_MD_CTX *ctx = NULL; unsigned int siglen; RSA *key = NULL; - EVP_MD_CTX *ctx; int i; memset(sigstruct, 0, sizeof(*sigstruct)); @@ -384,7 +384,8 @@ bool encl_measure(struct encl *encl) return true; err: - EVP_MD_CTX_destroy(ctx); + if (ctx) + EVP_MD_CTX_destroy(ctx); RSA_free(key); return false; } -- GitLab From b84fc2e0139ba4b23b8039bd7cfd242894fe8f8b Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:43 +0200 Subject: [PATCH 0238/1290] selftests/sgx: Fix uninitialized pointer dereferences in encl_get_entry Ensure sym_tab and sym_names are zero-initialized and add an early-out condition in the unlikely (erroneous) case that the enclave ELF file would not contain a symbol table. This addresses -Werror=maybe-uninitialized compiler warnings for gcc -O2. Fixes: 33c5aac3bf32 ("selftests/sgx: Test complete changing of page type flow") Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/all/20231005153854.25566-3-jo.vanbulck%40cs.kuleuven.be --- tools/testing/selftests/sgx/load.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c index 94bdeac1cf041..c9f658e44de6c 100644 --- a/tools/testing/selftests/sgx/load.c +++ b/tools/testing/selftests/sgx/load.c @@ -136,11 +136,11 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) */ uint64_t encl_get_entry(struct encl *encl, const char *symbol) { + Elf64_Sym *symtab = NULL; + char *sym_names = NULL; Elf64_Shdr *sections; - Elf64_Sym *symtab; Elf64_Ehdr *ehdr; - char *sym_names; - int num_sym; + int num_sym = 0; int i; ehdr = encl->bin; @@ -161,6 +161,9 @@ uint64_t encl_get_entry(struct encl *encl, const char *symbol) } } + if (!symtab || !sym_names) + return 0; + for (i = 0; i < num_sym; i++) { Elf64_Sym *sym = &symtab[i]; -- GitLab From 853a57a43ebdb8c024160c1a0990bae85f4bcc2f Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:44 +0200 Subject: [PATCH 0239/1290] selftests/sgx: Include memory clobber for inline asm in test enclave Add the "memory" clobber to the EMODPE and EACCEPT asm blocks to tell the compiler the assembly code accesses to the secinfo struct. This ensures the compiler treats the asm block as a memory barrier and the write to secinfo will be visible to ENCLU. Fixes: 20404a808593 ("selftests/sgx: Add test for EPCM permission changes") Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Kai Huang Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/all/20231005153854.25566-4-jo.vanbulck%40cs.kuleuven.be --- tools/testing/selftests/sgx/test_encl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index c0d6397295e31..ae791df3e5a57 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -24,10 +24,11 @@ static void do_encl_emodpe(void *_op) secinfo.flags = op->flags; asm volatile(".byte 0x0f, 0x01, 0xd7" - : + : /* no outputs */ : "a" (EMODPE), "b" (&secinfo), - "c" (op->epc_addr)); + "c" (op->epc_addr) + : "memory" /* read from secinfo pointer */); } static void do_encl_eaccept(void *_op) @@ -42,7 +43,8 @@ static void do_encl_eaccept(void *_op) : "=a" (rax) : "a" (EACCEPT), "b" (&secinfo), - "c" (op->epc_addr)); + "c" (op->epc_addr) + : "memory" /* read from secinfo pointer */); op->ret = rax; } -- GitLab From f79464658d858e07af0c4c9b30f5baedeeae0c04 Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:45 +0200 Subject: [PATCH 0240/1290] selftests/sgx: Separate linker options Fixes "'linker' input unused [-Wunused-command-line-argument]" errors when compiling with clang. Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/all/20231005153854.25566-5-jo.vanbulck%40cs.kuleuven.be --- tools/testing/selftests/sgx/Makefile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile index 50aab6b57da34..dcdd04b322f83 100644 --- a/tools/testing/selftests/sgx/Makefile +++ b/tools/testing/selftests/sgx/Makefile @@ -12,9 +12,11 @@ OBJCOPY := $(CROSS_COMPILE)objcopy endif INCLUDES := -I$(top_srcdir)/tools/include -HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC -z noexecstack -ENCL_CFLAGS := -Wall -Werror -static -nostdlib -nostartfiles -fPIC \ +HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC +HOST_LDFLAGS := -z noexecstack -lcrypto +ENCL_CFLAGS += -Wall -Werror -static -nostdlib -nostartfiles -fPIC \ -fno-stack-protector -mrdrnd $(INCLUDES) +ENCL_LDFLAGS := -Wl,-T,test_encl.lds,--build-id=none TEST_CUSTOM_PROGS := $(OUTPUT)/test_sgx TEST_FILES := $(OUTPUT)/test_encl.elf @@ -28,7 +30,7 @@ $(OUTPUT)/test_sgx: $(OUTPUT)/main.o \ $(OUTPUT)/sigstruct.o \ $(OUTPUT)/call.o \ $(OUTPUT)/sign_key.o - $(CC) $(HOST_CFLAGS) -o $@ $^ -lcrypto + $(CC) $(HOST_CFLAGS) -o $@ $^ $(HOST_LDFLAGS) $(OUTPUT)/main.o: main.c $(CC) $(HOST_CFLAGS) -c $< -o $@ @@ -45,8 +47,8 @@ $(OUTPUT)/call.o: call.S $(OUTPUT)/sign_key.o: sign_key.S $(CC) $(HOST_CFLAGS) -c $< -o $@ -$(OUTPUT)/test_encl.elf: test_encl.lds test_encl.c test_encl_bootstrap.S - $(CC) $(ENCL_CFLAGS) -T $^ -o $@ -Wl,--build-id=none +$(OUTPUT)/test_encl.elf: test_encl.c test_encl_bootstrap.S + $(CC) $(ENCL_CFLAGS) $^ -o $@ $(ENCL_LDFLAGS) EXTRA_CLEAN := \ $(OUTPUT)/test_encl.elf \ -- GitLab From 304b259e63b94c4f825e0648b33b15f8962955c7 Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:46 +0200 Subject: [PATCH 0241/1290] selftests/sgx: Specify freestanding environment for enclave compilation Use -ffreestanding to assert the enclave compilation targets a freestanding environment (i.e., without "main" or standard libraries). This fixes clang reporting "undefined reference to `memset'" after erroneously optimizing away the provided memset/memcpy implementations. Still need to instruct the linker from using standard system startup functions, but drop -nostartfiles as it is implied by -nostdlib. Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Kai Huang Link: https://lore.kernel.org/all/20231005153854.25566-6-jo.vanbulck%40cs.kuleuven.be --- tools/testing/selftests/sgx/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile index dcdd04b322f83..7eb890bdd3f0c 100644 --- a/tools/testing/selftests/sgx/Makefile +++ b/tools/testing/selftests/sgx/Makefile @@ -14,7 +14,7 @@ endif INCLUDES := -I$(top_srcdir)/tools/include HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC HOST_LDFLAGS := -z noexecstack -lcrypto -ENCL_CFLAGS += -Wall -Werror -static -nostdlib -nostartfiles -fPIC \ +ENCL_CFLAGS += -Wall -Werror -static -nostdlib -ffreestanding -fPIC \ -fno-stack-protector -mrdrnd $(INCLUDES) ENCL_LDFLAGS := -Wl,-T,test_encl.lds,--build-id=none -- GitLab From 4f812df8f37463767c2a74c2bd77de94acdb2be6 Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:47 +0200 Subject: [PATCH 0242/1290] selftests/sgx: Remove redundant enclave base address save/restore Remove redundant push/pop pair that stores and restores the enclave base address in the test enclave, as it is never used after the pop and can anyway be easily retrieved via the __encl_base symbol. Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Kai Huang Link: https://lore.kernel.org/all/20231005153854.25566-7-jo.vanbulck%40cs.kuleuven.be --- tools/testing/selftests/sgx/test_encl_bootstrap.S | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/selftests/sgx/test_encl_bootstrap.S b/tools/testing/selftests/sgx/test_encl_bootstrap.S index 03ae0f57e29d0..e0ce993d3f2cb 100644 --- a/tools/testing/selftests/sgx/test_encl_bootstrap.S +++ b/tools/testing/selftests/sgx/test_encl_bootstrap.S @@ -55,12 +55,9 @@ encl_entry_core: push %rax push %rcx # push the address after EENTER - push %rbx # push the enclave base address call encl_body - pop %rbx # pop the enclave base address - /* Clear volatile GPRs, except RAX (EEXIT function). */ xor %rcx, %rcx xor %rdx, %rdx -- GitLab From f7884e732841450de36c2b1ed49b91e8e854a7d1 Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:48 +0200 Subject: [PATCH 0243/1290] selftests/sgx: Produce static-pie executable for test enclave The current combination of -static and -fPIC creates a static executable with position-dependent addresses for global variables. Use -static-pie and -fPIE to create a proper static position independent executable that can be loaded at any address without a dynamic linker. When building the original "lea (encl_stack)(%rbx), %rax" assembly code with -static-pie -fPIE, the linker complains about a relocation it cannot resolve: /usr/local/bin/ld: /tmp/cchIWyfG.o: relocation R_X86_64_32S against `.data' can not be used when making a PIE object; recompile with -fPIE collect2: error: ld returned 1 exit status Thus, since only RIP-relative addressing is legit for local symbols, use "encl_stack(%rip)" and declare an explicit "__encl_base" symbol at the start of the linker script to be able to calculate the stack address relative to the current TCS in the enclave assembly entry code. Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Kai Huang Link: https://lore.kernel.org/all/f9c24d89-ed72-7d9e-c650-050d722c6b04@cs.kuleuven.be/ Link: https://lore.kernel.org/all/20231005153854.25566-8-jo.vanbulck%40cs.kuleuven.be --- tools/testing/selftests/sgx/Makefile | 2 +- tools/testing/selftests/sgx/test_encl.lds | 1 + tools/testing/selftests/sgx/test_encl_bootstrap.S | 9 ++++++--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile index 7eb890bdd3f0c..8d2ba6adc92bd 100644 --- a/tools/testing/selftests/sgx/Makefile +++ b/tools/testing/selftests/sgx/Makefile @@ -14,7 +14,7 @@ endif INCLUDES := -I$(top_srcdir)/tools/include HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC HOST_LDFLAGS := -z noexecstack -lcrypto -ENCL_CFLAGS += -Wall -Werror -static -nostdlib -ffreestanding -fPIC \ +ENCL_CFLAGS += -Wall -Werror -static-pie -nostdlib -ffreestanding -fPIE \ -fno-stack-protector -mrdrnd $(INCLUDES) ENCL_LDFLAGS := -Wl,-T,test_encl.lds,--build-id=none diff --git a/tools/testing/selftests/sgx/test_encl.lds b/tools/testing/selftests/sgx/test_encl.lds index a1ec64f7d91fc..62d37160f59b2 100644 --- a/tools/testing/selftests/sgx/test_encl.lds +++ b/tools/testing/selftests/sgx/test_encl.lds @@ -10,6 +10,7 @@ PHDRS SECTIONS { . = 0; + __encl_base = .; .tcs : { *(.tcs*) } : tcs diff --git a/tools/testing/selftests/sgx/test_encl_bootstrap.S b/tools/testing/selftests/sgx/test_encl_bootstrap.S index e0ce993d3f2cb..28fe5d2ac0afd 100644 --- a/tools/testing/selftests/sgx/test_encl_bootstrap.S +++ b/tools/testing/selftests/sgx/test_encl_bootstrap.S @@ -42,9 +42,12 @@ encl_entry: # RBX contains the base address for TCS, which is the first address # inside the enclave for TCS #1 and one page into the enclave for - # TCS #2. By adding the value of encl_stack to it, we get - # the absolute address for the stack. - lea (encl_stack)(%rbx), %rax + # TCS #2. First make it relative by substracting __encl_base and + # then add the address of encl_stack to get the address for the stack. + lea __encl_base(%rip), %rax + sub %rax, %rbx + lea encl_stack(%rip), %rax + add %rbx, %rax jmp encl_entry_core encl_dyn_entry: # Entry point for dynamically created TCS page expected to follow -- GitLab From d06978e8e47a348c0d33462a8c2bf8f46d2b7df5 Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:49 +0200 Subject: [PATCH 0244/1290] selftests/sgx: Handle relocations in test enclave Static-pie binaries normally include a startup routine to perform any ELF relocations from .rela.dyn. Since the enclave loading process is different and glibc is not included, do the necessary relocation for encl_op_array entries manually at runtime relative to the enclave base to ensure correct function pointers. When keeping encl_op_array as a local variable on the stack, gcc without optimizations generates code that explicitly gets the right function addresses and stores them to create the array on the stack: encl_body: /* snipped */ lea do_encl_op_put_to_buf(%rip), %rax mov %rax, -0x50(%rbp) lea do_encl_op_get_from_buf(%rip), %rax mov %rax,-0x48(%rbp) lea do_encl_op_put_to_addr(%rip), %rax /* snipped */ However, gcc -Os or clang generate more efficient code that initializes encl_op_array by copying a "prepared copy" containing the absolute addresses of the functions (i.e., relative to the image base starting from 0) generated by the compiler/linker: encl_body: /* snipped */ lea prepared_copy(%rip), %rsi lea -0x48(%rsp), %rdi mov $0x10,%ecx rep movsl %ds:(%rsi),%es:(%rdi) /* snipped */ When building the enclave with -static-pie, the compiler/linker includes relocation entries for the function symbols in the "prepared copy": Relocation section '.rela.dyn' at offset 0x4000 contains 12 entries: Offset Info Type Symbol /* snipped; "prepared_copy" starts at 0x6000 */ 000000006000 000000000008 R_X86_64_RELATIVE 000000006008 000000000008 R_X86_64_RELATIVE 000000006010 000000000008 R_X86_64_RELATIVE 000000006018 000000000008 R_X86_64_RELATIVE 000000006020 000000000008 R_X86_64_RELATIVE 000000006028 000000000008 R_X86_64_RELATIVE 000000006030 000000000008 R_X86_64_RELATIVE 000000006038 000000000008 R_X86_64_RELATIVE Static-pie binaries normally include a glibc "_dl_relocate_static_pie" routine that will perform these relocations as part of the startup. However, since the enclave loading process is different and glibc is not included, we cannot rely on these relocations to be performed. Without relocations, the code would erroneously jump to the _absolute_ function address loaded from the local copy. Thus, declare "encl_op_array" as global and manually relocate the loaded function-pointer entries relative to the enclave base at runtime. This generates the following code: encl_body: /* snipped */ lea encl_op_array(%rip), %rcx lea __encl_base(%rip), %rax add (%rcx,%rdx,8),%rax jmp *%rax Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Kai Huang Link: https://lore.kernel.org/all/150d8ca8-2c66-60d1-f9fc-8e6279824e94@cs.kuleuven.be/ Link: https://lore.kernel.org/all/5c22de5a-4b3b-1f38-9771-409b4ec7f96d@cs.kuleuven.be/#r Link: https://lore.kernel.org/all/20231005153854.25566-9-jo.vanbulck%40cs.kuleuven.be --- tools/testing/selftests/sgx/test_encl.c | 50 +++++++++++++++++-------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index ae791df3e5a57..649604c526e77 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -121,21 +121,41 @@ static void do_encl_op_nop(void *_op) } +/* + * Symbol placed at the start of the enclave image by the linker script. + * Declare this extern symbol with visibility "hidden" to ensure the compiler + * does not access it through the GOT and generates position-independent + * addressing as __encl_base(%rip), so we can get the actual enclave base + * during runtime. + */ +extern const uint8_t __attribute__((visibility("hidden"))) __encl_base; + +typedef void (*encl_op_t)(void *); +static const encl_op_t encl_op_array[ENCL_OP_MAX] = { + do_encl_op_put_to_buf, + do_encl_op_get_from_buf, + do_encl_op_put_to_addr, + do_encl_op_get_from_addr, + do_encl_op_nop, + do_encl_eaccept, + do_encl_emodpe, + do_encl_init_tcs_page, +}; + void encl_body(void *rdi, void *rsi) { - const void (*encl_op_array[ENCL_OP_MAX])(void *) = { - do_encl_op_put_to_buf, - do_encl_op_get_from_buf, - do_encl_op_put_to_addr, - do_encl_op_get_from_addr, - do_encl_op_nop, - do_encl_eaccept, - do_encl_emodpe, - do_encl_init_tcs_page, - }; - - struct encl_op_header *op = (struct encl_op_header *)rdi; - - if (op->type < ENCL_OP_MAX) - (*encl_op_array[op->type])(op); + struct encl_op_header *header = (struct encl_op_header *)rdi; + encl_op_t op; + + if (header->type >= ENCL_OP_MAX) + return; + + /* + * The enclave base address needs to be added, as this call site + * *cannot be* made rip-relative by the compiler, or fixed up by + * any other possible means. + */ + op = ((uint64_t)&__encl_base) + encl_op_array[header->type]; + + (*op)(header); } -- GitLab From 9fd552ee32c6c1e27c125016b87d295bea6faea7 Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:50 +0200 Subject: [PATCH 0245/1290] selftests/sgx: Fix linker script asserts DEFINED only considers symbols, not section names. Hence, replace the check for .got.plt with the _GLOBAL_OFFSET_TABLE_ symbol and remove other (non-essential) asserts. Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/all/20231005153854.25566-10-jo.vanbulck%40cs.kuleuven.be --- tools/testing/selftests/sgx/test_encl.lds | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/testing/selftests/sgx/test_encl.lds b/tools/testing/selftests/sgx/test_encl.lds index 62d37160f59b2..6ffdfc9fb4cfb 100644 --- a/tools/testing/selftests/sgx/test_encl.lds +++ b/tools/testing/selftests/sgx/test_encl.lds @@ -35,8 +35,4 @@ SECTIONS } } -ASSERT(!DEFINED(.altinstructions), "ALTERNATIVES are not supported in enclaves") -ASSERT(!DEFINED(.altinstr_replacement), "ALTERNATIVES are not supported in enclaves") -ASSERT(!DEFINED(.discard.retpoline_safe), "RETPOLINE ALTERNATIVES are not supported in enclaves") -ASSERT(!DEFINED(.discard.nospec), "RETPOLINE ALTERNATIVES are not supported in enclaves") -ASSERT(!DEFINED(.got.plt), "Libcalls are not supported in enclaves") +ASSERT(!DEFINED(_GLOBAL_OFFSET_TABLE_), "Libcalls through GOT are not supported in enclaves") -- GitLab From a4c39ef4ed43103caef80029cd30427a9ff342d8 Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:51 +0200 Subject: [PATCH 0246/1290] selftests/sgx: Ensure test enclave buffer is entirely preserved Attach the "used" attribute to instruct the compiler to preserve the static encl_buffer, even if it appears it is not entirely referenced in the enclave code, as expected by the external tests manipulating page permissions. Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Kai Huang Link: https://lore.kernel.org/all/a2732938-f3db-a0af-3d68-a18060f66e79@cs.kuleuven.be/ Link: https://lore.kernel.org/all/20231005153854.25566-11-jo.vanbulck%40cs.kuleuven.be --- tools/testing/selftests/sgx/defines.h | 1 + tools/testing/selftests/sgx/test_encl.c | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index d8587c971941a..b8f482667ce14 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -13,6 +13,7 @@ #define __aligned(x) __attribute__((__aligned__(x))) #define __packed __attribute__((packed)) +#define __used __attribute__((used)) #include "../../../../arch/x86/include/asm/sgx.h" #include "../../../../arch/x86/include/asm/enclu.h" diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index 649604c526e77..7465f121fb746 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -5,11 +5,12 @@ #include "defines.h" /* - * Data buffer spanning two pages that will be placed first in .data - * segment. Even if not used internally the second page is needed by - * external test manipulating page permissions. + * Data buffer spanning two pages that will be placed first in the .data + * segment. Even if not used internally the second page is needed by external + * test manipulating page permissions, so mark encl_buffer as "used" to make + * sure it is entirely preserved by the compiler. */ -static uint8_t encl_buffer[8192] = { 1 }; +static uint8_t __used encl_buffer[8192] = { 1 }; enum sgx_enclu_function { EACCEPT = 0x5, -- GitLab From 022416496008bd51cb7b33975cc0008749329a86 Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:52 +0200 Subject: [PATCH 0247/1290] selftests/sgx: Ensure expected location of test enclave buffer The external tests manipulating page permissions expect encl_buffer to be placed at the start of the test enclave's .data section. As this is not guaranteed per the C standard, explicitly place encl_buffer in a separate section that is explicitly placed at the start of the .data segment in the linker script to avoid the compiler placing it somewhere else in .data. Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Acked-by: Kai Huang Link: https://lore.kernel.org/all/20231005153854.25566-12-jo.vanbulck%40cs.kuleuven.be --- tools/testing/selftests/sgx/defines.h | 1 + tools/testing/selftests/sgx/test_encl.c | 8 ++++---- tools/testing/selftests/sgx/test_encl.lds | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index b8f482667ce14..402f8787a71cc 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -14,6 +14,7 @@ #define __aligned(x) __attribute__((__aligned__(x))) #define __packed __attribute__((packed)) #define __used __attribute__((used)) +#define __section(x)__attribute__((__section__(x))) #include "../../../../arch/x86/include/asm/sgx.h" #include "../../../../arch/x86/include/asm/enclu.h" diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c index 7465f121fb746..2c4d709cce2d9 100644 --- a/tools/testing/selftests/sgx/test_encl.c +++ b/tools/testing/selftests/sgx/test_encl.c @@ -6,11 +6,11 @@ /* * Data buffer spanning two pages that will be placed first in the .data - * segment. Even if not used internally the second page is needed by external - * test manipulating page permissions, so mark encl_buffer as "used" to make - * sure it is entirely preserved by the compiler. + * segment via the linker script. Even if not used internally the second page + * is needed by external test manipulating page permissions, so mark + * encl_buffer as "used" to make sure it is entirely preserved by the compiler. */ -static uint8_t __used encl_buffer[8192] = { 1 }; +static uint8_t __used __section(".data.encl_buffer") encl_buffer[8192] = { 1 }; enum sgx_enclu_function { EACCEPT = 0x5, diff --git a/tools/testing/selftests/sgx/test_encl.lds b/tools/testing/selftests/sgx/test_encl.lds index 6ffdfc9fb4cfb..333a3e78fdc97 100644 --- a/tools/testing/selftests/sgx/test_encl.lds +++ b/tools/testing/selftests/sgx/test_encl.lds @@ -24,6 +24,7 @@ SECTIONS } : text .data : { + *(.data.encl_buffer) *(.data*) } : data -- GitLab From ec44ca1e34bc3a9864a8c1bf8636066ec6d5d2e5 Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:53 +0200 Subject: [PATCH 0248/1290] selftests/sgx: Discard unsupported ELF sections Building the test enclave with -static-pie may produce a dynamic symbol table, but this is not supported for enclaves and any relocations need to happen manually (e.g., as for "encl_op_array"). Thus, opportunistically discard ".dyn*" and ".gnu.hash" which the enclave loader cannot handle. Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/all/20231005153854.25566-13-jo.vanbulck%40cs.kuleuven.be --- tools/testing/selftests/sgx/test_encl.lds | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/sgx/test_encl.lds b/tools/testing/selftests/sgx/test_encl.lds index 333a3e78fdc97..ffe851a1cac40 100644 --- a/tools/testing/selftests/sgx/test_encl.lds +++ b/tools/testing/selftests/sgx/test_encl.lds @@ -33,6 +33,8 @@ SECTIONS *(.note*) *(.debug*) *(.eh_frame*) + *(.dyn*) + *(.gnu.hash) } } -- GitLab From 886c5be0b12e89b2905c26c4f24d50ae91f261da Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Thu, 5 Oct 2023 17:38:54 +0200 Subject: [PATCH 0249/1290] selftests/sgx: Remove incomplete ABI sanitization code in test enclave As the selftest enclave is *not* intended for production, simplify the code by not initializing CPU configuration registers as expected by the ABI on enclave entry or cleansing caller-save registers on enclave exit. Signed-off-by: Jo Van Bulck Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/all/da0cfb1e-e347-f7f2-ac72-aec0ee0d867d@intel.com/ Link: https://lore.kernel.org/all/20231005153854.25566-14-jo.vanbulck%40cs.kuleuven.be --- .../testing/selftests/sgx/test_encl_bootstrap.S | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/sgx/test_encl_bootstrap.S b/tools/testing/selftests/sgx/test_encl_bootstrap.S index 28fe5d2ac0afd..d8c4ac94e032c 100644 --- a/tools/testing/selftests/sgx/test_encl_bootstrap.S +++ b/tools/testing/selftests/sgx/test_encl_bootstrap.S @@ -59,21 +59,11 @@ encl_entry_core: push %rcx # push the address after EENTER + # NOTE: as the selftest enclave is *not* intended for production, + # simplify the code by not initializing ABI registers on entry or + # cleansing caller-save registers on exit. call encl_body - /* Clear volatile GPRs, except RAX (EEXIT function). */ - xor %rcx, %rcx - xor %rdx, %rdx - xor %rdi, %rdi - xor %rsi, %rsi - xor %r8, %r8 - xor %r9, %r9 - xor %r10, %r10 - xor %r11, %r11 - - # Reset status flags. - add %rdx, %rdx # OF = SF = AF = CF = 0; ZF = PF = 1 - # Prepare EEXIT target by popping the address of the instruction after # EENTER to RBX. pop %rbx -- GitLab From 981cf568a8644161c2f15c02278ebc2834b51ba6 Mon Sep 17 00:00:00 2001 From: Zhao Mengmeng Date: Tue, 5 Dec 2023 21:56:05 -0500 Subject: [PATCH 0250/1290] selftests/sgx: Skip non X86_64 platform When building whole selftests on arm64, rsync gives an erorr about sgx: rsync: [sender] link_stat "/root/linux-next/tools/testing/selftests/sgx/test_encl.elf" failed: No such file or directory (2) rsync error: some files/attrs were not transferred (see previous errors) (code 23) at main.c(1327) [sender=3.2.5] The root casue is sgx only used on X86_64, and shall be skipped on other platforms. Fix this by moving TEST_CUSTOM_PROGS and TEST_FILES inside the if check, then the build result will be "Skipping non-existent dir: sgx". Fixes: 2adcba79e69d ("selftests/x86: Add a selftest for SGX") Signed-off-by: Zhao Mengmeng Signed-off-by: Dave Hansen Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/all/20231206025605.3965302-1-zhaomzhao%40126.com --- tools/testing/selftests/sgx/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile index 8d2ba6adc92bd..867f88ce2570a 100644 --- a/tools/testing/selftests/sgx/Makefile +++ b/tools/testing/selftests/sgx/Makefile @@ -18,10 +18,10 @@ ENCL_CFLAGS += -Wall -Werror -static-pie -nostdlib -ffreestanding -fPIE \ -fno-stack-protector -mrdrnd $(INCLUDES) ENCL_LDFLAGS := -Wl,-T,test_encl.lds,--build-id=none +ifeq ($(CAN_BUILD_X86_64), 1) TEST_CUSTOM_PROGS := $(OUTPUT)/test_sgx TEST_FILES := $(OUTPUT)/test_encl.elf -ifeq ($(CAN_BUILD_X86_64), 1) all: $(TEST_CUSTOM_PROGS) $(OUTPUT)/test_encl.elf endif -- GitLab From d3e09f57345f86cf5c47a89fc216f6c98f9a4a7a Mon Sep 17 00:00:00 2001 From: Anshul Dalal Date: Fri, 8 Dec 2023 13:20:35 +0530 Subject: [PATCH 0251/1290] dt-bindings: input: gpio-mouse: Convert to json-schema Convert device tree binding documentation for GPIO attached mouse to json-schema. Signed-off-by: Anshul Dalal Reviewed-by: Linus Walleij Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231208075037.114598-1-anshulusr@gmail.com Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/gpio-mouse.txt | 32 --------- .../devicetree/bindings/input/gpio-mouse.yaml | 68 +++++++++++++++++++ 2 files changed, 68 insertions(+), 32 deletions(-) delete mode 100644 Documentation/devicetree/bindings/input/gpio-mouse.txt create mode 100644 Documentation/devicetree/bindings/input/gpio-mouse.yaml diff --git a/Documentation/devicetree/bindings/input/gpio-mouse.txt b/Documentation/devicetree/bindings/input/gpio-mouse.txt deleted file mode 100644 index 519510a11af98..0000000000000 --- a/Documentation/devicetree/bindings/input/gpio-mouse.txt +++ /dev/null @@ -1,32 +0,0 @@ -Device-Tree bindings for GPIO attached mice - -This simply uses standard GPIO handles to define a simple mouse connected -to 5-7 GPIO lines. - -Required properties: - - compatible: must be "gpio-mouse" - - scan-interval-ms: The scanning interval in milliseconds - - up-gpios: GPIO line phandle to the line indicating "up" - - down-gpios: GPIO line phandle to the line indicating "down" - - left-gpios: GPIO line phandle to the line indicating "left" - - right-gpios: GPIO line phandle to the line indicating "right" - -Optional properties: - - button-left-gpios: GPIO line handle to the left mouse button - - button-middle-gpios: GPIO line handle to the middle mouse button - - button-right-gpios: GPIO line handle to the right mouse button -Example: - -#include - -gpio-mouse { - compatible = "gpio-mouse"; - scan-interval-ms = <50>; - up-gpios = <&gpio0 0 GPIO_ACTIVE_LOW>; - down-gpios = <&gpio0 1 GPIO_ACTIVE_LOW>; - left-gpios = <&gpio0 2 GPIO_ACTIVE_LOW>; - right-gpios = <&gpio0 3 GPIO_ACTIVE_LOW>; - button-left-gpios = <&gpio0 4 GPIO_ACTIVE_LOW>; - button-middle-gpios = <&gpio0 5 GPIO_ACTIVE_LOW>; - button-right-gpios = <&gpio0 6 GPIO_ACTIVE_LOW>; -}; diff --git a/Documentation/devicetree/bindings/input/gpio-mouse.yaml b/Documentation/devicetree/bindings/input/gpio-mouse.yaml new file mode 100644 index 0000000000000..3928ec6aff1dc --- /dev/null +++ b/Documentation/devicetree/bindings/input/gpio-mouse.yaml @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/gpio-mouse.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: GPIO attached mouse + +description: | + This simply uses standard GPIO handles to define a simple mouse connected + to 5-7 GPIO lines. + +maintainers: + - Anshul Dalal + +properties: + compatible: + const: gpio-mouse + + scan-interval-ms: + maxItems: 1 + + up-gpios: + maxItems: 1 + + down-gpios: + maxItems: 1 + + left-gpios: + maxItems: 1 + + right-gpios: + maxItems: 1 + + button-left-gpios: + maxItems: 1 + + button-middle-gpios: + maxItems: 1 + + button-right-gpios: + maxItems: 1 + +required: + - compatible + - scan-interval-ms + - up-gpios + - down-gpios + - left-gpios + - right-gpios + +additionalProperties: false + +examples: + - | + #include + + gpio-mouse { + compatible = "gpio-mouse"; + scan-interval-ms = <50>; + up-gpios = <&gpio0 0 GPIO_ACTIVE_LOW>; + down-gpios = <&gpio0 1 GPIO_ACTIVE_LOW>; + left-gpios = <&gpio0 2 GPIO_ACTIVE_LOW>; + right-gpios = <&gpio0 3 GPIO_ACTIVE_LOW>; + button-left-gpios = <&gpio0 4 GPIO_ACTIVE_LOW>; + button-middle-gpios = <&gpio0 5 GPIO_ACTIVE_LOW>; + button-right-gpios = <&gpio0 6 GPIO_ACTIVE_LOW>; + }; -- GitLab From 909484169a7bd935b4e56e1d77d63e8fccf6c6f1 Mon Sep 17 00:00:00 2001 From: Marcus Folkesson Date: Sat, 2 Dec 2023 17:59:48 +0100 Subject: [PATCH 0252/1290] Input: pxrc - simplify mutex handling with guard macro Use the guard(mutex) macro for handle mutex lock/unlocks. Signed-off-by: Marcus Folkesson Link: https://lore.kernel.org/r/20231202-pxrc-guard-v3-1-2ca8bc8cf689@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/pxrc.c | 42 +++++++++++++++-------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/drivers/input/joystick/pxrc.c b/drivers/input/joystick/pxrc.c index ea2bf5951d677..52d9eab667b7a 100644 --- a/drivers/input/joystick/pxrc.c +++ b/drivers/input/joystick/pxrc.c @@ -5,15 +5,17 @@ * Copyright (C) 2018 Marcus Folkesson */ -#include +#include #include -#include +#include +#include #include +#include +#include #include + #include #include -#include -#include #define PXRC_VENDOR_ID 0x1781 #define PXRC_PRODUCT_ID 0x0898 @@ -81,33 +83,28 @@ exit: static int pxrc_open(struct input_dev *input) { struct pxrc *pxrc = input_get_drvdata(input); - int retval; + int error; - mutex_lock(&pxrc->pm_mutex); - retval = usb_submit_urb(pxrc->urb, GFP_KERNEL); - if (retval) { + guard(mutex)(&pxrc->pm_mutex); + error = usb_submit_urb(pxrc->urb, GFP_KERNEL); + if (error) { dev_err(&pxrc->intf->dev, "%s - usb_submit_urb failed, error: %d\n", - __func__, retval); - retval = -EIO; - goto out; + __func__, error); + return -EIO; } pxrc->is_open = true; - -out: - mutex_unlock(&pxrc->pm_mutex); - return retval; + return 0; } static void pxrc_close(struct input_dev *input) { struct pxrc *pxrc = input_get_drvdata(input); - mutex_lock(&pxrc->pm_mutex); + guard(mutex)(&pxrc->pm_mutex); usb_kill_urb(pxrc->urb); pxrc->is_open = false; - mutex_unlock(&pxrc->pm_mutex); } static void pxrc_free_urb(void *_pxrc) @@ -208,10 +205,9 @@ static int pxrc_suspend(struct usb_interface *intf, pm_message_t message) { struct pxrc *pxrc = usb_get_intfdata(intf); - mutex_lock(&pxrc->pm_mutex); + guard(mutex)(&pxrc->pm_mutex); if (pxrc->is_open) usb_kill_urb(pxrc->urb); - mutex_unlock(&pxrc->pm_mutex); return 0; } @@ -219,14 +215,12 @@ static int pxrc_suspend(struct usb_interface *intf, pm_message_t message) static int pxrc_resume(struct usb_interface *intf) { struct pxrc *pxrc = usb_get_intfdata(intf); - int retval = 0; - mutex_lock(&pxrc->pm_mutex); + guard(mutex)(&pxrc->pm_mutex); if (pxrc->is_open && usb_submit_urb(pxrc->urb, GFP_KERNEL) < 0) - retval = -EIO; + return -EIO; - mutex_unlock(&pxrc->pm_mutex); - return retval; + return 0; } static int pxrc_pre_reset(struct usb_interface *intf) -- GitLab From 5b20755b7780464fea3e54af0af744258dcc2841 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 26 Nov 2023 16:19:14 +0900 Subject: [PATCH 0253/1290] init: move THIS_MODULE from to Commit f50169324df4 ("module.h: split out the EXPORT_SYMBOL into export.h") appropriately separated EXPORT_SYMBOL into because modules and EXPORT_SYMBOL are orthogonal; modules are symbol consumers, while EXPORT_SYMBOL are used by symbol providers, which may not be necessarily a module. However, that commit also relocated THIS_MODULE. As explained in the commit description, the intention was to define THIS_MODULE in a lightweight header, but I do not believe was the best location because EXPORT_SYMBOL and THIS_MODULE are unrelated. Move it to another lightweight header, . The reason for choosing is to make self-contained without relying on incorrectly including . With this adjustment, the role of becomes clearer as it only defines EXPORT_SYMBOL. Signed-off-by: Masahiro Yamada Reviewed-by: Luis Chamberlain --- include/linux/export.h | 18 ------------------ include/linux/init.h | 7 +++++++ 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/include/linux/export.h b/include/linux/export.h index 9911508a9604f..0bbd02fd351db 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -6,15 +6,6 @@ #include #include -/* - * Export symbols from the kernel to modules. Forked from module.h - * to reduce the amount of pointless cruft we feed to gcc when only - * exporting a simple symbol or two. - * - * Try not to add #includes here. It slows compilation and makes kernel - * hackers place grumpy comments in header files. - */ - /* * This comment block is used by fixdep. Please do not remove. * @@ -23,15 +14,6 @@ * side effect of the *.o build rule. */ -#ifndef __ASSEMBLY__ -#ifdef MODULE -extern struct module __this_module; -#define THIS_MODULE (&__this_module) -#else -#define THIS_MODULE ((struct module *)0) -#endif -#endif /* __ASSEMBLY__ */ - #ifdef CONFIG_64BIT #define __EXPORT_SYMBOL_REF(sym) \ .balign 8 ASM_NL \ diff --git a/include/linux/init.h b/include/linux/init.h index 01b52c9c75268..3fa3f6241350b 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -179,6 +179,13 @@ extern void (*late_time_init)(void); extern bool initcall_debug; +#ifdef MODULE +extern struct module __this_module; +#define THIS_MODULE (&__this_module) +#else +#define THIS_MODULE ((struct module *)0) +#endif + #endif #ifndef MODULE -- GitLab From 53243e098397185d910c10207bc3c0c26f072383 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 29 Nov 2023 08:53:56 +0900 Subject: [PATCH 0254/1290] kbuild: deb-pkg: remove the fakeroot builds support In 2017, the dpkg suite introduced the rootless builds support with the following commits: - 2436807c87b0 ("dpkg-deb: Add support for rootless builds") - fca1bfe84068 ("dpkg-buildpackage: Add support for rootless builds") This feature is available in the default dpkg on Debian 10 and Ubuntu 20.04. Remove the old method. Signed-off-by: Masahiro Yamada --- scripts/Makefile.package | 4 +--- scripts/package/builddeb | 8 +------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/scripts/Makefile.package b/scripts/Makefile.package index 0c3adc48dfe89..a81dfb1f51810 100644 --- a/scripts/Makefile.package +++ b/scripts/Makefile.package @@ -109,8 +109,6 @@ debian-orig: linux.tar$(debian-orig-suffix) debian cp $< ../$(orig-name); \ fi -KBUILD_PKG_ROOTCMD ?= 'fakeroot -u' - PHONY += deb-pkg srcdeb-pkg bindeb-pkg deb-pkg: private build-type := source,binary @@ -125,7 +123,7 @@ deb-pkg srcdeb-pkg bindeb-pkg: $(if $(findstring source, $(build-type)), \ --unsigned-source --compression=$(KDEB_SOURCE_COMPRESS)) \ $(if $(findstring binary, $(build-type)), \ - -R'$(MAKE) -f debian/rules' -j1 -r$(KBUILD_PKG_ROOTCMD) -a$$(cat debian/arch), \ + -R'$(MAKE) -f debian/rules' -j1 -a$$(cat debian/arch), \ --no-check-builddeps) \ $(DPKG_FLAGS)) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index d7dd0d04c70c9..2fe51e6919da3 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -36,19 +36,13 @@ create_package() { sh -c "cd '$pdir'; find . -type f ! -path './DEBIAN/*' -printf '%P\0' \ | xargs -r0 md5sum > DEBIAN/md5sums" - # Fix ownership and permissions - if [ "$DEB_RULES_REQUIRES_ROOT" = "no" ]; then - dpkg_deb_opts="--root-owner-group" - else - chown -R root:root "$pdir" - fi # a+rX in case we are in a restrictive umask environment like 0077 # ug-s in case we build in a setuid/setgid directory chmod -R go-w,a+rX,ug-s "$pdir" # Create the package dpkg-gencontrol -p$pname -P"$pdir" - dpkg-deb $dpkg_deb_opts ${KDEB_COMPRESS:+-Z$KDEB_COMPRESS} --build "$pdir" .. + dpkg-deb --root-owner-group ${KDEB_COMPRESS:+-Z$KDEB_COMPRESS} --build "$pdir" .. } install_linux_image () { -- GitLab From cbe826b058bb3547f195144fc018957871568320 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 17:05:48 +0900 Subject: [PATCH 0255/1290] kbuild: determine base DTB by suffix When using the -dtbs syntax, you need to list the base first, as follows: foo-dtbs := foo_base.dtb foo_overlay1.dtbo foo_overlay2.dtbo dtb-y := foo.dtb You cannot do this arrangement: foo-dtbs := foo_overlay1.dtbo foo_overlay2.dtbo foo_base.dtb This restriction comes from $(firstword ...) in the current implementation, but it is unneeded to rely on the order in the -dtbs syntax. Instead, you can simply determine the base by the suffix because the base (*.dtb) and overlays (*.dtbo) use different suffixes. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/Makefile.lib | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 1a965fe68e011..cd5b181060f15 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -83,8 +83,8 @@ dtb-$(CONFIG_OF_ALL_DTBS) += $(dtb-) multi-dtb-y := $(call multi-search, $(dtb-y), .dtb, -dtbs) # Primitive DTB compiled from *.dts real-dtb-y := $(call real-search, $(dtb-y), .dtb, -dtbs) -# Base DTB that overlay is applied onto (each first word of $(*-dtbs) expansion) -base-dtb-y := $(foreach m, $(multi-dtb-y), $(firstword $(call suffix-search, $m, .dtb, -dtbs))) +# Base DTB that overlay is applied onto +base-dtb-y := $(filter %.dtb, $(call real-search, $(multi-dtb-y), .dtb, -dtbs)) always-y += $(dtb-y) -- GitLab From cc87b7c06f2a6a1fbc7e06ccf6123aada4d0b588 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 18:49:31 +0900 Subject: [PATCH 0256/1290] modpost: move __attribute__((format(printf, 2, 3))) to modpost.h This attribute must be added to the function declaration in a header for comprehensive checking of all the callsites. Fixes: 6d9a89ea4b06 ("kbuild: declare the modpost error functions as printf like") Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor --- scripts/mod/modpost.c | 3 +-- scripts/mod/modpost.h | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index cb6406f485a96..ca0a90158f85c 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -60,8 +60,7 @@ static unsigned int nr_unresolved; #define MODULE_NAME_LEN (64 - sizeof(Elf_Addr)) -void __attribute__((format(printf, 2, 3))) -modpost_log(enum loglevel loglevel, const char *fmt, ...) +void modpost_log(enum loglevel loglevel, const char *fmt, ...) { va_list arglist; diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 69baf014da4fd..9fe974dc1a528 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -197,7 +197,8 @@ enum loglevel { LOG_FATAL }; -void modpost_log(enum loglevel loglevel, const char *fmt, ...); +void __attribute__((format(printf, 2, 3))) +modpost_log(enum loglevel loglevel, const char *fmt, ...); /* * warn - show the given message, then let modpost continue running, still -- GitLab From 16a473f60edc30ffcdf355676263730a6028ec67 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 18:49:32 +0900 Subject: [PATCH 0257/1290] modpost: inform compilers that fatal() never returns The function fatal() never returns because modpost_log() calls exit(1) when LOG_FATAL is passed. Inform compilers of this fact so that unreachable code flow can be identified at compile time. Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor --- scripts/mod/modpost.c | 3 +++ scripts/mod/modpost.h | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index ca0a90158f85c..c13bc9095df31 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -90,6 +90,9 @@ void modpost_log(enum loglevel loglevel, const char *fmt, ...) error_occurred = true; } +void __attribute__((alias("modpost_log"))) +modpost_log_noret(enum loglevel loglevel, const char *fmt, ...); + static inline bool strends(const char *str, const char *postfix) { if (strlen(str) < strlen(postfix)) diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 9fe974dc1a528..835cababf1b09 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -200,6 +200,9 @@ enum loglevel { void __attribute__((format(printf, 2, 3))) modpost_log(enum loglevel loglevel, const char *fmt, ...); +void __attribute__((format(printf, 2, 3), noreturn)) +modpost_log_noret(enum loglevel loglevel, const char *fmt, ...); + /* * warn - show the given message, then let modpost continue running, still * allowing modpost to exit successfully. This should be used when @@ -215,4 +218,4 @@ modpost_log(enum loglevel loglevel, const char *fmt, ...); */ #define warn(fmt, args...) modpost_log(LOG_WARN, fmt, ##args) #define error(fmt, args...) modpost_log(LOG_ERROR, fmt, ##args) -#define fatal(fmt, args...) modpost_log(LOG_FATAL, fmt, ##args) +#define fatal(fmt, args...) modpost_log_noret(LOG_FATAL, fmt, ##args) -- GitLab From 5cac96f937021de3b0fbc60cdc6d6c4ee5b2456d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 18:49:33 +0900 Subject: [PATCH 0258/1290] modpost: remove unneeded initializer in section_rel() This initializer was added to avoid -Wmaybe-uninitialized (gcc) and -Wsometimes-uninitialized (clang) warnings. Now that compilers recognize fatal() never returns, it is unneeded. Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor --- scripts/mod/modpost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index c13bc9095df31..3233946fa5f67 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1421,7 +1421,7 @@ static void section_rel(struct module *mod, struct elf_info *elf, for (rel = start; rel < stop; rel++) { Elf_Sym *tsym; - Elf_Addr taddr = 0, r_offset; + Elf_Addr taddr, r_offset; unsigned int r_type, r_sym; void *loc; -- GitLab From c9f2b8d45aa453ee58e66a9b0e7a54e170381585 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 18:49:34 +0900 Subject: [PATCH 0259/1290] modpost: remove unreachable code after fatal() Now compilers can recognize fatal() never returns. While GCC 4.5 dropped support for -Wunreachable-code, Clang is capable of detecting the unreachable code. $ make HOSTCC=clang HOSTCFLAGS=-Wunreachable-code-return [snip] HOSTCC scripts/mod/modpost.o scripts/mod/modpost.c:520:11: warning: 'return' will never be executed [-Wunreachable-code-return] return 0; ^ scripts/mod/modpost.c:477:10: warning: 'return' will never be executed [-Wunreachable-code-return] return 0; ^ 2 warnings generated. Signed-off-by: Masahiro Yamada Reviewed-by: Nathan Chancellor --- scripts/mod/modpost.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 3233946fa5f67..e2bc180cecc81 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -476,11 +476,9 @@ static int parse_elf(struct elf_info *info, const char *filename) fatal("%s: not relocatable object.", filename); /* Check if file offset is correct */ - if (hdr->e_shoff > info->size) { + if (hdr->e_shoff > info->size) fatal("section header offset=%lu in file '%s' is bigger than filesize=%zu\n", (unsigned long)hdr->e_shoff, filename, info->size); - return 0; - } if (hdr->e_shnum == SHN_UNDEF) { /* @@ -518,12 +516,11 @@ static int parse_elf(struct elf_info *info, const char *filename) const char *secname; int nobits = sechdrs[i].sh_type == SHT_NOBITS; - if (!nobits && sechdrs[i].sh_offset > info->size) { + if (!nobits && sechdrs[i].sh_offset > info->size) fatal("%s is truncated. sechdrs[i].sh_offset=%lu > sizeof(*hrd)=%zu\n", filename, (unsigned long)sechdrs[i].sh_offset, sizeof(*hdr)); - return 0; - } + secname = secstrings + sechdrs[i].sh_name; if (strcmp(secname, ".modinfo") == 0) { if (nobits) -- GitLab From 53c5adff34d77166ed69a4e4bdae3694fe961476 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 19:14:16 +0900 Subject: [PATCH 0260/1290] sparc: vdso: clean up build artifacts in arch/sparc/vdso/ Currently, vdso-image-*.c, vdso*.so, vdso*.so.dbg are not cleaned because 'make clean' does not include include/config/auto.conf, resulting in $(vdso_img-y) being empty. Add the build artifacts to 'targets' unconditionally. Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- arch/sparc/vdso/Makefile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile index d08c3a0443f3a..eb52d0666ffc4 100644 --- a/arch/sparc/vdso/Makefile +++ b/arch/sparc/vdso/Makefile @@ -24,11 +24,8 @@ targets += vdso.lds $(vobjs-y) # Build the vDSO image C files and link them in. vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) -vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) -vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) obj-y += $(vdso_img_objs) -targets += $(vdso_img_cfiles) -targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so) +targets += $(foreach x, 32 64, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg) CPPFLAGS_vdso.lds += -P -C -- GitLab From 918d8f94720a103a48ffb5a3ec10c0f680ba78ad Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 19:14:17 +0900 Subject: [PATCH 0261/1290] sparc: vdso: simplify obj-y addition Add objects to obj-y in a more straightforward way. Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- arch/sparc/vdso/Makefile | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile index eb52d0666ffc4..03a32b6156eef 100644 --- a/arch/sparc/vdso/Makefile +++ b/arch/sparc/vdso/Makefile @@ -3,9 +3,6 @@ # Building vDSO images for sparc. # -VDSO64-$(CONFIG_SPARC64) := y -VDSOCOMPAT-$(CONFIG_COMPAT) := y - # files to link into the vdso vobjs-y := vdso-note.o vclock_gettime.o @@ -13,18 +10,14 @@ vobjs-y := vdso-note.o vclock_gettime.o obj-y += vma.o # vDSO images to build -vdso_img-$(VDSO64-y) += 64 -vdso_img-$(VDSOCOMPAT-y) += 32 +obj-$(CONFIG_SPARC64) += vdso-image-64.o +obj-$(CONFIG_COMPAT) += vdso-image-32.o vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) $(obj)/vdso.o: $(obj)/vdso.so targets += vdso.lds $(vobjs-y) - -# Build the vDSO image C files and link them in. -vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) -obj-y += $(vdso_img_objs) targets += $(foreach x, 32 64, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg) CPPFLAGS_vdso.lds += -P -C -- GitLab From d821f8a26efb6789666d70ce7a8f27df6c33c12e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 19:14:18 +0900 Subject: [PATCH 0262/1290] sparc: vdso: use $(addprefix ) instead of $(foreach ) $(addprefix ) is slightly shorter and more intuitive. Signed-off-by: Masahiro Yamada Acked-by: Sam Ravnborg --- arch/sparc/vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile index 03a32b6156eef..7f5eedf1f5e0a 100644 --- a/arch/sparc/vdso/Makefile +++ b/arch/sparc/vdso/Makefile @@ -13,7 +13,7 @@ obj-y += vma.o obj-$(CONFIG_SPARC64) += vdso-image-64.o obj-$(CONFIG_COMPAT) += vdso-image-32.o -vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) +vobjs := $(addprefix $(obj)/, $(vobjs-y)) $(obj)/vdso.o: $(obj)/vdso.so -- GitLab From 6c07fd84977b605b6a4ceb03b38e6325974f06d6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 19:25:23 +0900 Subject: [PATCH 0263/1290] kconfig: factor out common code shared by mconf and nconf Separate out the duplicated code to mnconf-common.c. Signed-off-by: Masahiro Yamada --- scripts/kconfig/Makefile | 4 +-- scripts/kconfig/mconf.c | 54 +-------------------------------- scripts/kconfig/mnconf-common.c | 53 ++++++++++++++++++++++++++++++++ scripts/kconfig/mnconf-common.h | 18 +++++++++++ scripts/kconfig/nconf.c | 53 +------------------------------- 5 files changed, 75 insertions(+), 107 deletions(-) create mode 100644 scripts/kconfig/mnconf-common.c create mode 100644 scripts/kconfig/mnconf-common.h diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index 322c061b464d2..7c025f82718e2 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -174,7 +174,7 @@ conf-objs := conf.o $(common-objs) # nconf: Used for the nconfig target based on ncurses hostprogs += nconf -nconf-objs := nconf.o nconf.gui.o $(common-objs) +nconf-objs := nconf.o nconf.gui.o mnconf-common.o $(common-objs) HOSTLDLIBS_nconf = $(call read-file, $(obj)/nconf-libs) HOSTCFLAGS_nconf.o = $(call read-file, $(obj)/nconf-cflags) @@ -187,7 +187,7 @@ $(obj)/nconf.o $(obj)/nconf.gui.o: | $(obj)/nconf-cflags hostprogs += mconf lxdialog := $(addprefix lxdialog/, \ checklist.o inputbox.o menubox.o textbox.o util.o yesno.o) -mconf-objs := mconf.o $(lxdialog) $(common-objs) +mconf-objs := mconf.o $(lxdialog) mnconf-common.o $(common-objs) HOSTLDLIBS_mconf = $(call read-file, $(obj)/mconf-libs) $(foreach f, mconf.o $(lxdialog), \ diff --git a/scripts/kconfig/mconf.c b/scripts/kconfig/mconf.c index 3795c36a9181a..5df32148a8695 100644 --- a/scripts/kconfig/mconf.c +++ b/scripts/kconfig/mconf.c @@ -21,6 +21,7 @@ #include "lkc.h" #include "lxdialog/dialog.h" +#include "mnconf-common.h" static const char mconf_readme[] = "Overview\n" @@ -286,7 +287,6 @@ static int single_menu_mode; static int show_all_options; static int save_and_exit; static int silent; -static int jump_key_char; static void conf(struct menu *menu, struct menu *active_menu); @@ -378,58 +378,6 @@ static void show_help(struct menu *menu) str_free(&help); } -struct search_data { - struct list_head *head; - struct menu *target; -}; - -static int next_jump_key(int key) -{ - if (key < '1' || key > '9') - return '1'; - - key++; - - if (key > '9') - key = '1'; - - return key; -} - -static int handle_search_keys(int key, size_t start, size_t end, void *_data) -{ - struct search_data *data = _data; - struct jump_key *pos; - int index = 0; - - if (key < '1' || key > '9') - return 0; - - list_for_each_entry(pos, data->head, entries) { - index = next_jump_key(index); - - if (pos->offset < start) - continue; - - if (pos->offset >= end) - break; - - if (key == index) { - data->target = pos->target; - return 1; - } - } - - return 0; -} - -int get_jump_key_char(void) -{ - jump_key_char = next_jump_key(jump_key_char); - - return jump_key_char; -} - static void search_conf(void) { struct symbol **sym_arr; diff --git a/scripts/kconfig/mnconf-common.c b/scripts/kconfig/mnconf-common.c new file mode 100644 index 0000000000000..18cb9a6c5aaad --- /dev/null +++ b/scripts/kconfig/mnconf-common.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "expr.h" +#include "list.h" +#include "mnconf-common.h" + +int jump_key_char; + +int next_jump_key(int key) +{ + if (key < '1' || key > '9') + return '1'; + + key++; + + if (key > '9') + key = '1'; + + return key; +} + +int handle_search_keys(int key, size_t start, size_t end, void *_data) +{ + struct search_data *data = _data; + struct jump_key *pos; + int index = 0; + + if (key < '1' || key > '9') + return 0; + + list_for_each_entry(pos, data->head, entries) { + index = next_jump_key(index); + + if (pos->offset < start) + continue; + + if (pos->offset >= end) + break; + + if (key == index) { + data->target = pos->target; + return 1; + } + } + + return 0; +} + +int get_jump_key_char(void) +{ + jump_key_char = next_jump_key(jump_key_char); + + return jump_key_char; +} diff --git a/scripts/kconfig/mnconf-common.h b/scripts/kconfig/mnconf-common.h new file mode 100644 index 0000000000000..ab6292cc4bf25 --- /dev/null +++ b/scripts/kconfig/mnconf-common.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef MNCONF_COMMON_H +#define MNCONF_COMMON_H + +#include + +struct search_data { + struct list_head *head; + struct menu *target; +}; + +extern int jump_key_char; + +int next_jump_key(int key); +int handle_search_keys(int key, size_t start, size_t end, void *_data); +int get_jump_key_char(void); + +#endif /* MNCONF_COMMON_H */ diff --git a/scripts/kconfig/nconf.c b/scripts/kconfig/nconf.c index 8cd72fe259740..1148163cfa7e7 100644 --- a/scripts/kconfig/nconf.c +++ b/scripts/kconfig/nconf.c @@ -12,6 +12,7 @@ #include #include "lkc.h" +#include "mnconf-common.h" #include "nconf.h" #include @@ -279,7 +280,6 @@ static const char *current_instructions = menu_instructions; static char *dialog_input_result; static int dialog_input_result_len; -static int jump_key_char; static void selected_conf(struct menu *menu, struct menu *active_menu); static void conf(struct menu *menu); @@ -691,57 +691,6 @@ static int do_exit(void) return 0; } -struct search_data { - struct list_head *head; - struct menu *target; -}; - -static int next_jump_key(int key) -{ - if (key < '1' || key > '9') - return '1'; - - key++; - - if (key > '9') - key = '1'; - - return key; -} - -static int handle_search_keys(int key, size_t start, size_t end, void *_data) -{ - struct search_data *data = _data; - struct jump_key *pos; - int index = 0; - - if (key < '1' || key > '9') - return 0; - - list_for_each_entry(pos, data->head, entries) { - index = next_jump_key(index); - - if (pos->offset < start) - continue; - - if (pos->offset >= end) - break; - - if (key == index) { - data->target = pos->target; - return 1; - } - } - - return 0; -} - -int get_jump_key_char(void) -{ - jump_key_char = next_jump_key(jump_key_char); - - return jump_key_char; -} static void search_conf(void) { -- GitLab From 4a8ececbb50f0dd9395ffc4188ae780916df4a9c Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 22 Nov 2023 16:50:50 -0700 Subject: [PATCH 0264/1290] dt-bindings: dma: Drop undocumented examples The compatibles "ti,omap-sdma" and "ti,dra7-dma-crossbar" aren't documented by a schema which causes warnings: Documentation/devicetree/bindings/dma/dma-controller.example.dtb: /example-0/dma-controller@48000000: failed to match any schema with compatible: ['ti,omap-sdma'] Documentation/devicetree/bindings/dma/dma-router.example.dtb: /example-0/dma-router@4a002b78: failed to match any schema with compatible: ['ti,dra7-dma-crossbar'] As no one has cared to fix them, just drop them. Signed-off-by: Rob Herring Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231122235050.2966280-1-robh@kernel.org Signed-off-by: Vinod Koul --- .../devicetree/bindings/dma/dma-controller.yaml | 15 --------------- .../devicetree/bindings/dma/dma-router.yaml | 11 ----------- 2 files changed, 26 deletions(-) diff --git a/Documentation/devicetree/bindings/dma/dma-controller.yaml b/Documentation/devicetree/bindings/dma/dma-controller.yaml index 04d150d4d15d3..e6afca558c2df 100644 --- a/Documentation/devicetree/bindings/dma/dma-controller.yaml +++ b/Documentation/devicetree/bindings/dma/dma-controller.yaml @@ -19,19 +19,4 @@ properties: additionalProperties: true -examples: - - | - dma: dma-controller@48000000 { - compatible = "ti,omap-sdma"; - reg = <0x48000000 0x1000>; - interrupts = <0 12 0x4>, - <0 13 0x4>, - <0 14 0x4>, - <0 15 0x4>; - #dma-cells = <1>; - dma-channels = <32>; - dma-requests = <127>; - dma-channel-mask = <0xfffe>; - }; - ... diff --git a/Documentation/devicetree/bindings/dma/dma-router.yaml b/Documentation/devicetree/bindings/dma/dma-router.yaml index 346fe0fa4460e..5ad2febc581e2 100644 --- a/Documentation/devicetree/bindings/dma/dma-router.yaml +++ b/Documentation/devicetree/bindings/dma/dma-router.yaml @@ -40,15 +40,4 @@ required: additionalProperties: true -examples: - - | - sdma_xbar: dma-router@4a002b78 { - compatible = "ti,dra7-dma-crossbar"; - reg = <0x4a002b78 0xfc>; - #dma-cells = <1>; - dma-requests = <205>; - ti,dma-safe-map = <0>; - dma-masters = <&sdma>; - }; - ... -- GitLab From 8e578b47e6d92d5e43982ddc54045973dd4a7de5 Mon Sep 17 00:00:00 2001 From: Shravan Chippa Date: Fri, 8 Dec 2023 16:08:53 +0530 Subject: [PATCH 0265/1290] dmaengine: sf-pdma: Support of_dma_controller_register() Update sf-pdma driver to adopt generic DMA device tree bindings. It calls of_dma_controller_register() with of_dma_xlate_by_chan_id to get the generic DMA device tree helper support and the DMA clients can look up the sf-pdma controller using standard APIs. Signed-off-by: Shravan Chippa Link: https://lore.kernel.org/r/20231208103856.3732998-2-shravan.chippa@microchip.com Signed-off-by: Vinod Koul --- drivers/dma/sf-pdma/sf-pdma.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/dma/sf-pdma/sf-pdma.c b/drivers/dma/sf-pdma/sf-pdma.c index 3125a2f162b47..6109e1c5a09ea 100644 --- a/drivers/dma/sf-pdma/sf-pdma.c +++ b/drivers/dma/sf-pdma/sf-pdma.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "sf-pdma.h" @@ -563,7 +564,20 @@ static int sf_pdma_probe(struct platform_device *pdev) return ret; } + ret = of_dma_controller_register(pdev->dev.of_node, + of_dma_xlate_by_chan_id, pdma); + if (ret < 0) { + dev_err(&pdev->dev, + "Can't register SiFive Platform OF_DMA. (%d)\n", ret); + goto err_unregister; + } + return 0; + +err_unregister: + dma_async_device_unregister(&pdma->dma_dev); + + return ret; } static void sf_pdma_remove(struct platform_device *pdev) @@ -583,6 +597,9 @@ static void sf_pdma_remove(struct platform_device *pdev) tasklet_kill(&ch->err_tasklet); } + if (pdev->dev.of_node) + of_dma_controller_free(pdev->dev.of_node); + dma_async_device_unregister(&pdma->dma_dev); } -- GitLab From 72b22006ba78c2e3bf39b486a7b8155dc9020133 Mon Sep 17 00:00:00 2001 From: Shravan Chippa Date: Fri, 8 Dec 2023 16:08:54 +0530 Subject: [PATCH 0266/1290] dt-bindings: dma: sf-pdma: add new compatible name Add new compatible name microchip,mpfs-pdma to support out of order dma transfers Reviewed-by: Conor Dooley Signed-off-by: Shravan Chippa Link: https://lore.kernel.org/r/20231208103856.3732998-3-shravan.chippa@microchip.com Signed-off-by: Vinod Koul --- .../devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml b/Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml index a1af0b9063653..3b22183a1a379 100644 --- a/Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml +++ b/Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml @@ -29,6 +29,7 @@ properties: compatible: items: - enum: + - microchip,mpfs-pdma - sifive,fu540-c000-pdma - const: sifive,pdma0 description: -- GitLab From 58eea79a1cf285a62af886851b1a91ed5aceb401 Mon Sep 17 00:00:00 2001 From: Shravan Chippa Date: Fri, 8 Dec 2023 16:08:55 +0530 Subject: [PATCH 0267/1290] dmaengine: sf-pdma: add mpfs-pdma compatible name Sifive platform dma (sf-pdma) has both in-order and out-of-order configurations but sf-pdam driver configured to do in-order DMA transfers, with out-of-order configuration got better throughput in the PolarFire SoC platform. Add a PolarFire SoC specific compatible and code to support for out-of-order dma transfers Reviewed-by: Emil Renner Berthing Signed-off-by: Shravan Chippa Link: https://lore.kernel.org/r/20231208103856.3732998-4-shravan.chippa@microchip.com Signed-off-by: Vinod Koul --- drivers/dma/sf-pdma/sf-pdma.c | 27 ++++++++++++++++++++++++--- drivers/dma/sf-pdma/sf-pdma.h | 8 +++++++- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/drivers/dma/sf-pdma/sf-pdma.c b/drivers/dma/sf-pdma/sf-pdma.c index 6109e1c5a09ea..428473611115d 100644 --- a/drivers/dma/sf-pdma/sf-pdma.c +++ b/drivers/dma/sf-pdma/sf-pdma.c @@ -25,6 +25,8 @@ #include "sf-pdma.h" +#define PDMA_QUIRK_NO_STRICT_ORDERING BIT(0) + #ifndef readq static inline unsigned long long readq(void __iomem *addr) { @@ -66,7 +68,7 @@ static struct sf_pdma_desc *sf_pdma_alloc_desc(struct sf_pdma_chan *chan) static void sf_pdma_fill_desc(struct sf_pdma_desc *desc, u64 dst, u64 src, u64 size) { - desc->xfer_type = PDMA_FULL_SPEED; + desc->xfer_type = desc->chan->pdma->transfer_type; desc->xfer_size = size; desc->dst_addr = dst; desc->src_addr = src; @@ -493,6 +495,7 @@ static void sf_pdma_setup_chans(struct sf_pdma *pdma) static int sf_pdma_probe(struct platform_device *pdev) { + const struct sf_pdma_driver_platdata *ddata; struct sf_pdma *pdma; int ret, n_chans; const enum dma_slave_buswidth widths = @@ -518,6 +521,14 @@ static int sf_pdma_probe(struct platform_device *pdev) pdma->n_chans = n_chans; + pdma->transfer_type = PDMA_FULL_SPEED | PDMA_STRICT_ORDERING; + + ddata = device_get_match_data(&pdev->dev); + if (ddata) { + if (ddata->quirks & PDMA_QUIRK_NO_STRICT_ORDERING) + pdma->transfer_type &= ~PDMA_STRICT_ORDERING; + } + pdma->membase = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pdma->membase)) return PTR_ERR(pdma->membase); @@ -603,9 +614,19 @@ static void sf_pdma_remove(struct platform_device *pdev) dma_async_device_unregister(&pdma->dma_dev); } +static const struct sf_pdma_driver_platdata mpfs_pdma = { + .quirks = PDMA_QUIRK_NO_STRICT_ORDERING, +}; + static const struct of_device_id sf_pdma_dt_ids[] = { - { .compatible = "sifive,fu540-c000-pdma" }, - { .compatible = "sifive,pdma0" }, + { + .compatible = "sifive,fu540-c000-pdma", + }, { + .compatible = "sifive,pdma0", + }, { + .compatible = "microchip,mpfs-pdma", + .data = &mpfs_pdma, + }, {}, }; MODULE_DEVICE_TABLE(of, sf_pdma_dt_ids); diff --git a/drivers/dma/sf-pdma/sf-pdma.h b/drivers/dma/sf-pdma/sf-pdma.h index d05772b5d8d3f..215e07183d7e2 100644 --- a/drivers/dma/sf-pdma/sf-pdma.h +++ b/drivers/dma/sf-pdma/sf-pdma.h @@ -48,7 +48,8 @@ #define PDMA_ERR_STATUS_MASK GENMASK(31, 31) /* Transfer Type */ -#define PDMA_FULL_SPEED 0xFF000008 +#define PDMA_FULL_SPEED 0xFF000000 +#define PDMA_STRICT_ORDERING BIT(3) /* Error Recovery */ #define MAX_RETRY 1 @@ -112,8 +113,13 @@ struct sf_pdma { struct dma_device dma_dev; void __iomem *membase; void __iomem *mappedbase; + u32 transfer_type; u32 n_chans; struct sf_pdma_chan chans[] __counted_by(n_chans); }; +struct sf_pdma_driver_platdata { + u32 quirks; +}; + #endif /* _SF_PDMA_H */ -- GitLab From d95fcb78e7f263f909ce492c3882a704067dc534 Mon Sep 17 00:00:00 2001 From: Mohan Kumar Date: Tue, 28 Nov 2023 12:46:14 +0530 Subject: [PATCH 0268/1290] dt-bindings: dma: Add dma-channel-mask to nvidia,tegra210-adma Add dma-channel-mask binding doc support to nvidia,tegra210-adma to reserve the adma channel usage Acked-by: Rob Herring Signed-off-by: Mohan Kumar Link: https://lore.kernel.org/r/20231128071615.31447-2-mkumard@nvidia.com Signed-off-by: Vinod Koul --- .../devicetree/bindings/dma/nvidia,tegra210-adma.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/dma/nvidia,tegra210-adma.yaml b/Documentation/devicetree/bindings/dma/nvidia,tegra210-adma.yaml index 4003dbe94940c..877147e95ecc5 100644 --- a/Documentation/devicetree/bindings/dma/nvidia,tegra210-adma.yaml +++ b/Documentation/devicetree/bindings/dma/nvidia,tegra210-adma.yaml @@ -53,6 +53,9 @@ properties: ADMA_CHn_CTRL register. const: 1 + dma-channel-mask: + maxItems: 1 + required: - compatible - reg -- GitLab From 25b636225a0816eac20b02fcb37daf6c722d0bed Mon Sep 17 00:00:00 2001 From: Mohan Kumar Date: Tue, 28 Nov 2023 12:46:15 +0530 Subject: [PATCH 0269/1290] dmaengine: tegra210-adma: Support dma-channel-mask property To support the flexibility to reserve the specific dma channels add the support of dma-channel-mask property in the tegra210-adma driver Signed-off-by: Mohan Kumar Link: https://lore.kernel.org/r/20231128071615.31447-3-mkumard@nvidia.com Signed-off-by: Vinod Koul --- drivers/dma/tegra210-adma.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/dma/tegra210-adma.c b/drivers/dma/tegra210-adma.c index 7a0586633bf32..24ad7077c53ba 100644 --- a/drivers/dma/tegra210-adma.c +++ b/drivers/dma/tegra210-adma.c @@ -153,6 +153,7 @@ struct tegra_adma { void __iomem *base_addr; struct clk *ahub_clk; unsigned int nr_channels; + unsigned long *dma_chan_mask; unsigned long rx_requests_reserved; unsigned long tx_requests_reserved; @@ -741,6 +742,10 @@ static int __maybe_unused tegra_adma_runtime_suspend(struct device *dev) for (i = 0; i < tdma->nr_channels; i++) { tdc = &tdma->channels[i]; + /* skip for reserved channels */ + if (!tdc->tdma) + continue; + ch_reg = &tdc->ch_regs; ch_reg->cmd = tdma_ch_read(tdc, ADMA_CH_CMD); /* skip if channel is not active */ @@ -779,6 +784,9 @@ static int __maybe_unused tegra_adma_runtime_resume(struct device *dev) for (i = 0; i < tdma->nr_channels; i++) { tdc = &tdma->channels[i]; + /* skip for reserved channels */ + if (!tdc->tdma) + continue; ch_reg = &tdc->ch_regs; /* skip if channel was not active earlier */ if (!ch_reg->cmd) @@ -867,10 +875,31 @@ static int tegra_adma_probe(struct platform_device *pdev) return PTR_ERR(tdma->ahub_clk); } + tdma->dma_chan_mask = devm_kzalloc(&pdev->dev, + BITS_TO_LONGS(tdma->nr_channels) * sizeof(unsigned long), + GFP_KERNEL); + if (!tdma->dma_chan_mask) + return -ENOMEM; + + /* Enable all channels by default */ + bitmap_fill(tdma->dma_chan_mask, tdma->nr_channels); + + ret = of_property_read_u32_array(pdev->dev.of_node, "dma-channel-mask", + (u32 *)tdma->dma_chan_mask, + BITS_TO_U32(tdma->nr_channels)); + if (ret < 0 && (ret != -EINVAL)) { + dev_err(&pdev->dev, "dma-channel-mask is not complete.\n"); + return ret; + } + INIT_LIST_HEAD(&tdma->dma_dev.channels); for (i = 0; i < tdma->nr_channels; i++) { struct tegra_adma_chan *tdc = &tdma->channels[i]; + /* skip for reserved channels */ + if (!test_bit(i, tdma->dma_chan_mask)) + continue; + tdc->chan_addr = tdma->base_addr + cdata->ch_base_offset + (cdata->ch_reg_size * i); @@ -957,8 +986,10 @@ static void tegra_adma_remove(struct platform_device *pdev) of_dma_controller_free(pdev->dev.of_node); dma_async_device_unregister(&tdma->dma_dev); - for (i = 0; i < tdma->nr_channels; ++i) - irq_dispose_mapping(tdma->channels[i].irq); + for (i = 0; i < tdma->nr_channels; ++i) { + if (tdma->channels[i].irq) + irq_dispose_mapping(tdma->channels[i].irq); + } pm_runtime_disable(&pdev->dev); } -- GitLab From 57cdb720eaa5c4b2fcf04ba6ff6a26f638b0b474 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 17 Oct 2023 09:42:36 -0500 Subject: [PATCH 0270/1290] iommu/amd: Do not flush IRTE when only updating isRun and destination fields According to the recent update in the AMD IOMMU spec [1], the IsRun and Destination fields of the Interrupt Remapping Table Entry (IRTE) are not cached by the IOMMU hardware. Therefore, do not issue the INVALIDATE_INTERRUPT_TABLE command when updating IRTE[IsRun] and IRTE[Destination] when IRTE[GuestMode]=1, which should help improve IOMMU AVIC/x2AVIC performance. References: [1] AMD IOMMU Spec Revision (Rev 3.08-PUB) (Link: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/specifications/48882_IOMMU.pdf) Cc: Joao Martins Cc: Alejandro Jimenez Signed-off-by: Suravee Suthikulpanit Reviewed-by: Vasant Hegde Tested-by: Alejandro Jimenez Link: https://lore.kernel.org/r/20231017144236.8287-1-suravee.suthikulpanit@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 9f70643608283..b14046b6e1e77 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3111,8 +3111,8 @@ out: return index; } -static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, - struct irte_ga *irte) +static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, + struct irte_ga *irte) { struct irq_remap_table *table; struct irte_ga *entry; @@ -3139,6 +3139,18 @@ static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, raw_spin_unlock_irqrestore(&table->lock, flags); + return 0; +} + +static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, + struct irte_ga *irte) +{ + bool ret; + + ret = __modify_irte_ga(iommu, devid, index, irte); + if (ret) + return ret; + iommu_flush_irt_and_complete(iommu, devid); return 0; @@ -3822,8 +3834,8 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) } entry->lo.fields_vapic.is_run = is_run; - return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, - ir_data->irq_2_irte.index, entry); + return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, + ir_data->irq_2_irte.index, entry); } EXPORT_SYMBOL(amd_iommu_update_ga); #endif -- GitLab From af3263758bf0b65a040e90190ab708b8a4027947 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:07 +0000 Subject: [PATCH 0271/1290] iommu/amd: Rename iommu_flush_all_caches() -> amd_iommu_flush_all_caches() Rename function inline with driver naming convention. No functional changes. Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-2-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 5 +++++ drivers/iommu/amd/amd_iommu_types.h | 6 ------ drivers/iommu/amd/init.c | 8 ++++---- drivers/iommu/amd/iommu.c | 2 +- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 86be1edd50ee9..234db57cd3209 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -53,6 +53,11 @@ int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev); void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev); int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address); +/* + * This function flushes all internal caches of + * the IOMMU used by this driver. + */ +void amd_iommu_flush_all_caches(struct amd_iommu *iommu); void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); void amd_iommu_domain_update(struct protection_domain *domain); void amd_iommu_domain_flush_complete(struct protection_domain *domain); diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 90b7d7950a9ef..809d74faa1a5d 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -902,12 +902,6 @@ extern int amd_iommu_max_glx_val; extern u64 amd_iommu_efr; extern u64 amd_iommu_efr2; -/* - * This function flushes all internal caches of - * the IOMMU used by this driver. - */ -void iommu_flush_all_caches(struct amd_iommu *iommu); - static inline int get_ioapic_devid(int id) { struct devid_map *entry; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 64bcf3df37ee5..c83bd0c2a1c92 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2223,7 +2223,7 @@ static int __init amd_iommu_init_pci(void) init_device_table_dma(pci_seg); for_each_iommu(iommu) - iommu_flush_all_caches(iommu); + amd_iommu_flush_all_caches(iommu); print_iommu_info(); @@ -2773,7 +2773,7 @@ static void early_enable_iommu(struct amd_iommu *iommu) iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); iommu_enable(iommu); - iommu_flush_all_caches(iommu); + amd_iommu_flush_all_caches(iommu); } /* @@ -2829,7 +2829,7 @@ static void early_enable_iommus(void) iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); iommu_set_device_table(iommu); - iommu_flush_all_caches(iommu); + amd_iommu_flush_all_caches(iommu); } } } @@ -3293,7 +3293,7 @@ static int __init state_next(void) uninit_device_table_dma(pci_seg); for_each_iommu(iommu) - iommu_flush_all_caches(iommu); + amd_iommu_flush_all_caches(iommu); } } return ret; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b14046b6e1e77..c2578b567e35c 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1392,7 +1392,7 @@ static void amd_iommu_flush_irt_all(struct amd_iommu *iommu) iommu_completion_wait(iommu); } -void iommu_flush_all_caches(struct amd_iommu *iommu) +void amd_iommu_flush_all_caches(struct amd_iommu *iommu) { if (check_feature(FEATURE_IA)) { amd_iommu_flush_all(iommu); -- GitLab From 3f2571fed2fa9e9ea61fd990a9a4fc20ca83b62e Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:08 +0000 Subject: [PATCH 0272/1290] iommu/amd: Remove redundant domain flush from attach_device() Domain flush was introduced in attach_device() path to handle kdump scenario. Later init code was enhanced to handle kdump scenario where it also takes care of flushing everything including TLB (see early_enable_iommus()). Hence remove redundant flush from attach_device() function. Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-3-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index c2578b567e35c..2330766863793 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1896,15 +1896,6 @@ static int attach_device(struct device *dev, do_attach(dev_data, domain); - /* - * We might boot into a crash-kernel here. The crashed kernel - * left the caches in the IOMMU dirty. So we have to flush - * here to evict all dirty stuff. - */ - amd_iommu_domain_flush_tlb_pde(domain); - - amd_iommu_domain_flush_complete(domain); - out: spin_unlock(&dev_data->lock); -- GitLab From a976da66e8e547cd07a9792f5854c2f73c456a21 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:09 +0000 Subject: [PATCH 0273/1290] iommu/amd: Remove redundant passing of PDE bit Current code always sets PDE bit in INVALIDATE_IOMMU_PAGES command. Hence get rid of 'pde' variable across functions. We can re-introduce this bit whenever its needed. Suggested-by: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-4-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 2330766863793..b1551330758b5 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1124,7 +1124,7 @@ static inline u64 build_inv_address(u64 address, size_t size) } static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, - size_t size, u16 domid, int pde) + size_t size, u16 domid) { u64 inv_address = build_inv_address(address, size); @@ -1133,8 +1133,8 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, cmd->data[2] = lower_32_bits(inv_address); cmd->data[3] = upper_32_bits(inv_address); CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); - if (pde) /* PDE bit - we want to flush everything, not only the PTEs */ - cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + /* PDE bit - we want to flush everything, not only the PTEs */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; } static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, @@ -1341,7 +1341,7 @@ static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu) for (dom_id = 0; dom_id <= last_bdf; ++dom_id) { struct iommu_cmd cmd; build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - dom_id, 1); + dom_id); iommu_queue_command(iommu, &cmd); } @@ -1352,8 +1352,7 @@ static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id) { struct iommu_cmd cmd; - build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - dom_id, 1); + build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, dom_id); iommu_queue_command(iommu, &cmd); iommu_completion_wait(iommu); @@ -1476,13 +1475,13 @@ static int device_flush_dte(struct iommu_dev_data *dev_data) * page. Otherwise it flushes the whole TLB of the IOMMU. */ static void __domain_flush_pages(struct protection_domain *domain, - u64 address, size_t size, int pde) + u64 address, size_t size) { struct iommu_dev_data *dev_data; struct iommu_cmd cmd; int ret = 0, i; - build_inv_iommu_pages(&cmd, address, size, domain->id, pde); + build_inv_iommu_pages(&cmd, address, size, domain->id); for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { if (!domain->dev_iommu[i]) @@ -1507,10 +1506,10 @@ static void __domain_flush_pages(struct protection_domain *domain, } static void domain_flush_pages(struct protection_domain *domain, - u64 address, size_t size, int pde) + u64 address, size_t size) { if (likely(!amd_iommu_np_cache)) { - __domain_flush_pages(domain, address, size, pde); + __domain_flush_pages(domain, address, size); return; } @@ -1543,7 +1542,7 @@ static void domain_flush_pages(struct protection_domain *domain, flush_size = 1ul << min_alignment; - __domain_flush_pages(domain, address, flush_size, pde); + __domain_flush_pages(domain, address, flush_size); address += flush_size; size -= flush_size; } @@ -1552,7 +1551,7 @@ static void domain_flush_pages(struct protection_domain *domain, /* Flush the whole IO/TLB for a given protection domain - including PDE */ void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain) { - domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); + domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } void amd_iommu_domain_flush_complete(struct protection_domain *domain) @@ -1579,7 +1578,7 @@ static void domain_flush_np_cache(struct protection_domain *domain, unsigned long flags; spin_lock_irqsave(&domain->lock, flags); - domain_flush_pages(domain, iova, size, 1); + domain_flush_pages(domain, iova, size); amd_iommu_domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -2591,7 +2590,7 @@ static void amd_iommu_iotlb_sync(struct iommu_domain *domain, unsigned long flags; spin_lock_irqsave(&dom->lock, flags); - domain_flush_pages(dom, gather->start, gather->end - gather->start + 1, 1); + domain_flush_pages(dom, gather->start, gather->end - gather->start + 1); amd_iommu_domain_flush_complete(dom); spin_unlock_irqrestore(&dom->lock, flags); } -- GitLab From cf62924daf9f0363c82e4332d9ac9630f1d76c42 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:10 +0000 Subject: [PATCH 0274/1290] iommu/amd: Add support to invalidate multiple guest pages Current interface supports invalidating single page or entire guest translation information for a single process address space. IOMMU CMD_INV_IOMMU_PAGES and CMD_INV_IOTLB_PAGES commands supports invalidating range of pages. Add support to invalidate multiple pages. This is preparatory patch before consolidating host and guest invalidation code into single function. Following patches will consolidation tlb invalidation code. Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-5-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b1551330758b5..b26db4f0b6508 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1152,40 +1152,36 @@ static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, } static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, u32 pasid, - u64 address, bool size) + u64 address, size_t size) { - memset(cmd, 0, sizeof(*cmd)); + u64 inv_address = build_inv_address(address, size); - address &= ~(0xfffULL); + memset(cmd, 0, sizeof(*cmd)); cmd->data[0] = pasid; cmd->data[1] = domid; - cmd->data[2] = lower_32_bits(address); - cmd->data[3] = upper_32_bits(address); + cmd->data[2] = lower_32_bits(inv_address); + cmd->data[3] = upper_32_bits(inv_address); cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; - if (size) - cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); } static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid, - int qdep, u64 address, bool size) + int qdep, u64 address, size_t size) { - memset(cmd, 0, sizeof(*cmd)); + u64 inv_address = build_inv_address(address, size); - address &= ~(0xfffULL); + memset(cmd, 0, sizeof(*cmd)); cmd->data[0] = devid; cmd->data[0] |= ((pasid >> 8) & 0xff) << 16; cmd->data[0] |= (qdep & 0xff) << 24; cmd->data[1] = devid; cmd->data[1] |= (pasid & 0xff) << 16; - cmd->data[2] = lower_32_bits(address); + cmd->data[2] = lower_32_bits(inv_address); cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; - cmd->data[3] = upper_32_bits(address); - if (size) - cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + cmd->data[3] = upper_32_bits(inv_address); CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); } @@ -2656,7 +2652,7 @@ const struct iommu_ops amd_iommu_ops = { }; static int __flush_pasid(struct protection_domain *domain, u32 pasid, - u64 address, bool size) + u64 address, size_t size) { struct iommu_dev_data *dev_data; struct iommu_cmd cmd; @@ -2720,7 +2716,7 @@ out: static int __amd_iommu_flush_page(struct protection_domain *domain, u32 pasid, u64 address) { - return __flush_pasid(domain, pasid, address, false); + return __flush_pasid(domain, pasid, address, PAGE_SIZE); } int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, @@ -2739,8 +2735,7 @@ int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid) { - return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - true); + return __flush_pasid(domain, pasid, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid) -- GitLab From 4f0a600799237ed95b403f24354305b0f81ccbb4 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:11 +0000 Subject: [PATCH 0275/1290] iommu/amd: Refactor IOMMU tlb invalidation code build_inv_iommu_pages() and build_inv_iommu_pasid() pretty much duplicates the code. Hence enhance build_inv_iommu_pages() to invalidate guest pages as well. And remove build_inv_iommu_pasid(). Suggested-by: Kishon Vijay Abraham I Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-6-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b26db4f0b6508..8c09e8f639133 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1124,17 +1124,23 @@ static inline u64 build_inv_address(u64 address, size_t size) } static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, - size_t size, u16 domid) + size_t size, u16 domid, + ioasid_t pasid, bool gn) { u64 inv_address = build_inv_address(address, size); memset(cmd, 0, sizeof(*cmd)); + cmd->data[1] |= domid; cmd->data[2] = lower_32_bits(inv_address); cmd->data[3] = upper_32_bits(inv_address); - CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); /* PDE bit - we want to flush everything, not only the PTEs */ cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + if (gn) { + cmd->data[0] |= pasid; + cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; + } + CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); } static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, @@ -1151,22 +1157,6 @@ static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); } -static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, u32 pasid, - u64 address, size_t size) -{ - u64 inv_address = build_inv_address(address, size); - - memset(cmd, 0, sizeof(*cmd)); - - cmd->data[0] = pasid; - cmd->data[1] = domid; - cmd->data[2] = lower_32_bits(inv_address); - cmd->data[3] = upper_32_bits(inv_address); - cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; - cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; - CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); -} - static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid, int qdep, u64 address, size_t size) { @@ -1337,7 +1327,7 @@ static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu) for (dom_id = 0; dom_id <= last_bdf; ++dom_id) { struct iommu_cmd cmd; build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - dom_id); + dom_id, IOMMU_NO_PASID, false); iommu_queue_command(iommu, &cmd); } @@ -1348,7 +1338,8 @@ static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id) { struct iommu_cmd cmd; - build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, dom_id); + build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + dom_id, IOMMU_NO_PASID, false); iommu_queue_command(iommu, &cmd); iommu_completion_wait(iommu); @@ -1477,7 +1468,8 @@ static void __domain_flush_pages(struct protection_domain *domain, struct iommu_cmd cmd; int ret = 0, i; - build_inv_iommu_pages(&cmd, address, size, domain->id); + build_inv_iommu_pages(&cmd, address, size, domain->id, + IOMMU_NO_PASID, false); for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { if (!domain->dev_iommu[i]) @@ -2661,7 +2653,7 @@ static int __flush_pasid(struct protection_domain *domain, u32 pasid, if (!(domain->flags & PD_IOMMUV2_MASK)) return -EINVAL; - build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size); + build_inv_iommu_pages(&cmd, address, size, domain->id, pasid, true); /* * IOMMU TLB needs to be flushed before Device TLB to -- GitLab From bbf85fe10faadbd237a7b8df4423de4ab4bd67e7 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:12 +0000 Subject: [PATCH 0276/1290] iommu/amd: Refactor device iotlb invalidation code build_inv_iotlb_pages() and build_inv_iotlb_pasid() pretty much duplicates the code. Enhance build_inv_iotlb_pages() to invalidate guest IOTLB as well. And remove build_inv_iotlb_pasid() function. Suggested-by: Kishon Vijay Abraham I Signed-off-by: Vasant Hegde Reviewed-by: Suravee Suthikulpanit Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-7-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 8c09e8f639133..a071e1d52c3e8 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1144,34 +1144,24 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, } static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, - u64 address, size_t size) + u64 address, size_t size, + ioasid_t pasid, bool gn) { u64 inv_address = build_inv_address(address, size); memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = devid; cmd->data[0] |= (qdep & 0xff) << 24; cmd->data[1] = devid; cmd->data[2] = lower_32_bits(inv_address); cmd->data[3] = upper_32_bits(inv_address); - CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); -} - -static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid, - int qdep, u64 address, size_t size) -{ - u64 inv_address = build_inv_address(address, size); - - memset(cmd, 0, sizeof(*cmd)); + if (gn) { + cmd->data[0] |= ((pasid >> 8) & 0xff) << 16; + cmd->data[1] |= (pasid & 0xff) << 16; + cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; + } - cmd->data[0] = devid; - cmd->data[0] |= ((pasid >> 8) & 0xff) << 16; - cmd->data[0] |= (qdep & 0xff) << 24; - cmd->data[1] = devid; - cmd->data[1] |= (pasid & 0xff) << 16; - cmd->data[2] = lower_32_bits(inv_address); - cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; - cmd->data[3] = upper_32_bits(inv_address); CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); } @@ -1404,7 +1394,8 @@ static int device_flush_iotlb(struct iommu_dev_data *dev_data, if (!iommu) return -EINVAL; - build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size); + build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, + size, IOMMU_NO_PASID, false); return iommu_queue_command(iommu, &cmd); } @@ -2687,8 +2678,8 @@ static int __flush_pasid(struct protection_domain *domain, u32 pasid, iommu = rlookup_amd_iommu(dev_data->dev); if (!iommu) continue; - build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid, - qdep, address, size); + build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, + address, size, pasid, true); ret = iommu_queue_command(iommu, &cmd); if (ret != 0) -- GitLab From 8d004ac1c67bc1584b4b6b90429487dac5b2d83f Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:13 +0000 Subject: [PATCH 0277/1290] iommu/amd: Consolidate amd_iommu_domain_flush_complete() call Call amd_iommu_domain_flush_complete() from domain_flush_pages(). That way we can remove explicit call of amd_iommu_domain_flush_complete() from various places. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-8-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/io_pgtable.c | 1 - drivers/iommu/amd/iommu.c | 21 ++++++++++----------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 6c0621f6f572a..ca22546e4d1a5 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -425,7 +425,6 @@ out: * increase_address_space(). */ amd_iommu_domain_flush_tlb_pde(dom); - amd_iommu_domain_flush_complete(dom); spin_unlock_irqrestore(&dom->lock, flags); } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index a071e1d52c3e8..cb8f51d5f83dd 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1489,6 +1489,10 @@ static void domain_flush_pages(struct protection_domain *domain, { if (likely(!amd_iommu_np_cache)) { __domain_flush_pages(domain, address, size); + + /* Wait until IOMMU TLB and all device IOTLB flushes are complete */ + amd_iommu_domain_flush_complete(domain); + return; } @@ -1525,6 +1529,9 @@ static void domain_flush_pages(struct protection_domain *domain, address += flush_size; size -= flush_size; } + + /* Wait until IOMMU TLB and all device IOTLB flushes are complete */ + amd_iommu_domain_flush_complete(domain); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ @@ -1558,7 +1565,6 @@ static void domain_flush_np_cache(struct protection_domain *domain, spin_lock_irqsave(&domain->lock, flags); domain_flush_pages(domain, iova, size); - amd_iommu_domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } } @@ -1836,12 +1842,9 @@ static void do_detach(struct iommu_dev_data *dev_data) /* Flush the DTE entry */ device_flush_dte(dev_data); - /* Flush IOTLB */ + /* Flush IOTLB and wait for the flushes to finish */ amd_iommu_domain_flush_tlb_pde(domain); - /* Wait for the flushes to finish */ - amd_iommu_domain_flush_complete(domain); - /* decrease reference counters - needs to happen after the flushes */ domain->dev_iommu[iommu->index] -= 1; domain->dev_cnt -= 1; @@ -2018,7 +2021,6 @@ void amd_iommu_domain_update(struct protection_domain *domain) /* Flush domain TLB(s) and wait for completion */ amd_iommu_domain_flush_tlb_pde(domain); - amd_iommu_domain_flush_complete(domain); } /***************************************************************************** @@ -2451,10 +2453,9 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, } /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ - if (domain_flush) { + if (domain_flush) amd_iommu_domain_flush_tlb_pde(pdomain); - amd_iommu_domain_flush_complete(pdomain); - } + pdomain->dirty_tracking = enable; spin_unlock_irqrestore(&pdomain->lock, flags); @@ -2558,7 +2559,6 @@ static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) spin_lock_irqsave(&dom->lock, flags); amd_iommu_domain_flush_tlb_pde(dom); - amd_iommu_domain_flush_complete(dom); spin_unlock_irqrestore(&dom->lock, flags); } @@ -2570,7 +2570,6 @@ static void amd_iommu_iotlb_sync(struct iommu_domain *domain, spin_lock_irqsave(&dom->lock, flags); domain_flush_pages(dom, gather->start, gather->end - gather->start + 1); - amd_iommu_domain_flush_complete(dom); spin_unlock_irqrestore(&dom->lock, flags); } -- GitLab From 2c535dd37d677abd11b278fce89b8bdb0a55facb Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:14 +0000 Subject: [PATCH 0278/1290] iommu/amd: Make domain_flush_pages as global function - Rename domain_flush_pages() -> amd_iommu_domain_flush_pages() and make it as global function. - Rename amd_iommu_domain_flush_tlb_pde() -> amd_iommu_domain_flush_all() and make it as static. - Convert v1 page table (io_pgtble.c) to use amd_iommu_domain_flush_pages(). Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-9-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 3 ++- drivers/iommu/amd/io_pgtable.c | 4 +++- drivers/iommu/amd/iommu.c | 22 ++++++++++++---------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 234db57cd3209..8b3601f285fd6 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -61,7 +61,8 @@ void amd_iommu_flush_all_caches(struct amd_iommu *iommu); void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); void amd_iommu_domain_update(struct protection_domain *domain); void amd_iommu_domain_flush_complete(struct protection_domain *domain); -void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain); +void amd_iommu_domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size); int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid); int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, unsigned long cr3); diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index ca22546e4d1a5..2a0d1e97e52fd 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -369,6 +369,8 @@ static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, bool updated = false; u64 __pte, *pte; int ret, i, count; + size_t size = pgcount << __ffs(pgsize); + unsigned long o_iova = iova; BUG_ON(!IS_ALIGNED(iova, pgsize)); BUG_ON(!IS_ALIGNED(paddr, pgsize)); @@ -424,7 +426,7 @@ out: * Updates and flushing already happened in * increase_address_space(). */ - amd_iommu_domain_flush_tlb_pde(dom); + amd_iommu_domain_flush_pages(dom, o_iova, size); spin_unlock_irqrestore(&dom->lock, flags); } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index cb8f51d5f83dd..77cf1e3de053b 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1484,8 +1484,8 @@ static void __domain_flush_pages(struct protection_domain *domain, WARN_ON(ret); } -static void domain_flush_pages(struct protection_domain *domain, - u64 address, size_t size) +void amd_iommu_domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size) { if (likely(!amd_iommu_np_cache)) { __domain_flush_pages(domain, address, size); @@ -1535,9 +1535,10 @@ static void domain_flush_pages(struct protection_domain *domain, } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain) +static void amd_iommu_domain_flush_all(struct protection_domain *domain) { - domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS); + amd_iommu_domain_flush_pages(domain, 0, + CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } void amd_iommu_domain_flush_complete(struct protection_domain *domain) @@ -1564,7 +1565,7 @@ static void domain_flush_np_cache(struct protection_domain *domain, unsigned long flags; spin_lock_irqsave(&domain->lock, flags); - domain_flush_pages(domain, iova, size); + amd_iommu_domain_flush_pages(domain, iova, size); spin_unlock_irqrestore(&domain->lock, flags); } } @@ -1843,7 +1844,7 @@ static void do_detach(struct iommu_dev_data *dev_data) device_flush_dte(dev_data); /* Flush IOTLB and wait for the flushes to finish */ - amd_iommu_domain_flush_tlb_pde(domain); + amd_iommu_domain_flush_all(domain); /* decrease reference counters - needs to happen after the flushes */ domain->dev_iommu[iommu->index] -= 1; @@ -2020,7 +2021,7 @@ void amd_iommu_domain_update(struct protection_domain *domain) amd_iommu_update_and_flush_device_table(domain); /* Flush domain TLB(s) and wait for completion */ - amd_iommu_domain_flush_tlb_pde(domain); + amd_iommu_domain_flush_all(domain); } /***************************************************************************** @@ -2454,7 +2455,7 @@ static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ if (domain_flush) - amd_iommu_domain_flush_tlb_pde(pdomain); + amd_iommu_domain_flush_all(pdomain); pdomain->dirty_tracking = enable; spin_unlock_irqrestore(&pdomain->lock, flags); @@ -2558,7 +2559,7 @@ static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) unsigned long flags; spin_lock_irqsave(&dom->lock, flags); - amd_iommu_domain_flush_tlb_pde(dom); + amd_iommu_domain_flush_all(dom); spin_unlock_irqrestore(&dom->lock, flags); } @@ -2569,7 +2570,8 @@ static void amd_iommu_iotlb_sync(struct iommu_domain *domain, unsigned long flags; spin_lock_irqsave(&dom->lock, flags); - domain_flush_pages(dom, gather->start, gather->end - gather->start + 1); + amd_iommu_domain_flush_pages(dom, gather->start, + gather->end - gather->start + 1); spin_unlock_irqrestore(&dom->lock, flags); } -- GitLab From c7fc12354be0ba47566d55f4ebdc6a47bd69d5ed Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Wed, 22 Nov 2023 09:02:15 +0000 Subject: [PATCH 0279/1290] iommu/amd/pgtbl_v2: Invalidate updated page ranges only Enhance __domain_flush_pages() to detect domain page table mode and use that info to build invalidation commands. So that we can use amd_iommu_domain_flush_pages() to invalidate v2 page table. Also pass PASID, gn variable to device_flush_iotlb() so that it can build IOTLB invalidation command for both v1 and v2 page table. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231122090215.6191-10-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/io_pgtable_v2.c | 10 ++-------- drivers/iommu/amd/iommu.c | 28 ++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index f818a7e254d42..6d69ba60744f0 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -244,7 +244,6 @@ static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, unsigned long mapped_size = 0; unsigned long o_iova = iova; size_t size = pgcount << __ffs(pgsize); - int count = 0; int ret = 0; bool updated = false; @@ -265,19 +264,14 @@ static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, *pte = set_pte_attr(paddr, map_size, prot); - count++; iova += map_size; paddr += map_size; mapped_size += map_size; } out: - if (updated) { - if (count > 1) - amd_iommu_flush_tlb(&pdom->domain, 0); - else - amd_iommu_flush_page(&pdom->domain, 0, o_iova); - } + if (updated) + amd_iommu_domain_flush_pages(pdom, o_iova, size); if (mapped) *mapped += mapped_size; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 77cf1e3de053b..255ea754c0cdd 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -85,6 +85,11 @@ static void detach_device(struct device *dev); * ****************************************************************************/ +static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom) +{ + return (pdom && (pdom->flags & PD_IOMMUV2_MASK)); +} + static inline int get_acpihid_device_id(struct device *dev, struct acpihid_map_entry **entry) { @@ -1382,8 +1387,8 @@ void amd_iommu_flush_all_caches(struct amd_iommu *iommu) /* * Command send function for flushing on-device TLB */ -static int device_flush_iotlb(struct iommu_dev_data *dev_data, - u64 address, size_t size) +static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address, + size_t size, ioasid_t pasid, bool gn) { struct amd_iommu *iommu; struct iommu_cmd cmd; @@ -1395,7 +1400,7 @@ static int device_flush_iotlb(struct iommu_dev_data *dev_data, return -EINVAL; build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, - size, IOMMU_NO_PASID, false); + size, pasid, gn); return iommu_queue_command(iommu, &cmd); } @@ -1441,8 +1446,11 @@ static int device_flush_dte(struct iommu_dev_data *dev_data) return ret; } - if (dev_data->ats_enabled) - ret = device_flush_iotlb(dev_data, 0, ~0UL); + if (dev_data->ats_enabled) { + /* Invalidate the entire contents of an IOTLB */ + ret = device_flush_iotlb(dev_data, 0, ~0UL, + IOMMU_NO_PASID, false); + } return ret; } @@ -1458,9 +1466,13 @@ static void __domain_flush_pages(struct protection_domain *domain, struct iommu_dev_data *dev_data; struct iommu_cmd cmd; int ret = 0, i; + ioasid_t pasid = IOMMU_NO_PASID; + bool gn = false; + + if (pdom_is_v2_pgtbl_mode(domain)) + gn = true; - build_inv_iommu_pages(&cmd, address, size, domain->id, - IOMMU_NO_PASID, false); + build_inv_iommu_pages(&cmd, address, size, domain->id, pasid, gn); for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { if (!domain->dev_iommu[i]) @@ -1478,7 +1490,7 @@ static void __domain_flush_pages(struct protection_domain *domain, if (!dev_data->ats_enabled) continue; - ret |= device_flush_iotlb(dev_data, address, size); + ret |= device_flush_iotlb(dev_data, address, size, pasid, gn); } WARN_ON(ret); -- GitLab From b6b2264ba2090bb63665458e4b9369781df7dd4d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 9 Dec 2023 23:12:40 +0000 Subject: [PATCH 0280/1290] iommu/apple-dart: Fix spelling mistake "grups" -> "groups" There is a spelling mistake in a dev_err message. Fix it. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20231209231240.4056082-1-colin.i.king@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/apple-dart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 7438e9c82ba98..497a516747bd7 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -911,7 +911,7 @@ static struct iommu_group *apple_dart_device_group(struct device *dev) ret = apple_dart_merge_master_cfg(group_master_cfg, cfg); if (ret) { - dev_err(dev, "Failed to merge DART IOMMU grups.\n"); + dev_err(dev, "Failed to merge DART IOMMU groups.\n"); iommu_group_put(group); res = ERR_PTR(ret); goto out; -- GitLab From 70f008fb3ea9bd2e6727eebc858405acd49a212b Mon Sep 17 00:00:00 2001 From: Amelie Delaunay Date: Fri, 24 Nov 2023 17:02:35 +0100 Subject: [PATCH 0281/1290] dmaengine: dmatest: prevent using swiotlb buffer with nobounce parameter Source and destination data buffers are allocated with GPF_KERNEL flag. It means that, if the DDR is more than 2GB, buffers can be allocated above the 32-bit addressable space. In this case, and if the dma controller is only 32-bit compatible, swiotlb bounce buffer, located in the 32-bit addressable space, is used and introduces a memcpy. To prevent this extra memcpy, due to swiotlb bounce buffer use because source or destination data buffer is allocated above the 32-bit addressable space, force source and destination data buffers allocation with GPF_DMA instead, when nobounce parameter is true. Signed-off-by: Amelie Delaunay Link: https://lore.kernel.org/r/20231124160235.2459326-1-amelie.delaunay@foss.st.com Signed-off-by: Vinod Koul --- drivers/dma/dmatest.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index ffe621695e472..a4f6088378492 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -21,6 +21,10 @@ #include #include +static bool nobounce; +module_param(nobounce, bool, 0644); +MODULE_PARM_DESC(nobounce, "Prevent using swiotlb buffer (default: use swiotlb buffer)"); + static unsigned int test_buf_size = 16384; module_param(test_buf_size, uint, 0644); MODULE_PARM_DESC(test_buf_size, "Size of the memcpy test buffer"); @@ -90,6 +94,7 @@ MODULE_PARM_DESC(polled, "Use polling for completion instead of interrupts"); /** * struct dmatest_params - test parameters. + * @nobounce: prevent using swiotlb buffer * @buf_size: size of the memcpy test buffer * @channel: bus ID of the channel to test * @device: bus ID of the DMA Engine to test @@ -106,6 +111,7 @@ MODULE_PARM_DESC(polled, "Use polling for completion instead of interrupts"); * @polled: use polling for completion instead of interrupts */ struct dmatest_params { + bool nobounce; unsigned int buf_size; char channel[20]; char device[32]; @@ -215,6 +221,7 @@ struct dmatest_done { struct dmatest_data { u8 **raw; u8 **aligned; + gfp_t gfp_flags; unsigned int cnt; unsigned int off; }; @@ -533,7 +540,7 @@ static int dmatest_alloc_test_data(struct dmatest_data *d, goto err; for (i = 0; i < d->cnt; i++) { - d->raw[i] = kmalloc(buf_size + align, GFP_KERNEL); + d->raw[i] = kmalloc(buf_size + align, d->gfp_flags); if (!d->raw[i]) goto err; @@ -655,6 +662,13 @@ static int dmatest_func(void *data) goto err_free_coefs; } + src->gfp_flags = GFP_KERNEL; + dst->gfp_flags = GFP_KERNEL; + if (params->nobounce) { + src->gfp_flags = GFP_DMA; + dst->gfp_flags = GFP_DMA; + } + if (dmatest_alloc_test_data(src, buf_size, align) < 0) goto err_free_coefs; @@ -1093,6 +1107,7 @@ static void add_threaded_test(struct dmatest_info *info) struct dmatest_params *params = &info->params; /* Copy test parameters */ + params->nobounce = nobounce; params->buf_size = test_buf_size; strscpy(params->channel, strim(test_channel), sizeof(params->channel)); strscpy(params->device, strim(test_device), sizeof(params->device)); -- GitLab From 8596ba324356a7392a6639024de8c9ae7a9fce92 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 29 Nov 2023 14:35:40 -0800 Subject: [PATCH 0282/1290] perf stat: Fix help message for --metric-no-threshold option Copy-paste error led to help message for metric-no-threshold repeating that of metric-no-merge. Fixes: 1fd09e299bdd434b ("perf metric: Add --metric-no-threshold option") Reported-by: Stephane Eranian Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231129223540.2247030-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d22228eddccbf..0bfa70791cfcf 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1255,7 +1255,7 @@ static struct option stat_options[] = { OPT_BOOLEAN(0, "metric-no-merge", &stat_config.metric_no_merge, "don't try to share events between metrics in a group"), OPT_BOOLEAN(0, "metric-no-threshold", &stat_config.metric_no_threshold, - "don't try to share events between metrics in a group "), + "disable adding events for the metric threshold calculation"), OPT_BOOLEAN(0, "topdown", &topdown_run, "measure top-down statistics"), OPT_UINTEGER(0, "td-level", &stat_config.topdown_level, -- GitLab From 02db1749f30fb88638e19fb16f2470724529eb81 Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Mon, 11 Dec 2023 23:17:57 +0100 Subject: [PATCH 0283/1290] Input: omap4-keypad - react on keypresses if device is runtime-suspended According to SWPU235AB, table 26-6, fclk is required to generate events at least on OMAP4460, so keep fclk enabled all the time the device is opened. Suggested-by: Tony Lindgren Signed-off-by: Andreas Kemnade Reviewed-by: Tony Lindgren Link: https://lore.kernel.org/r/20231211221757.517427-1-andreas@kemnade.info Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/omap4-keypad.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/input/keyboard/omap4-keypad.c b/drivers/input/keyboard/omap4-keypad.c index d3f8688fdd9c3..040b340995d89 100644 --- a/drivers/input/keyboard/omap4-keypad.c +++ b/drivers/input/keyboard/omap4-keypad.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,7 @@ struct omap4_keypad { bool no_autorepeat; u64 keys; unsigned short *keymap; + struct clk *fck; }; static int kbd_readl(struct omap4_keypad *keypad_data, u32 offset) @@ -209,6 +211,10 @@ static int omap4_keypad_open(struct input_dev *input) if (error) return error; + error = clk_prepare_enable(keypad_data->fck); + if (error) + goto out; + disable_irq(keypad_data->irq); kbd_writel(keypad_data, OMAP4_KBD_CTRL, @@ -226,10 +232,11 @@ static int omap4_keypad_open(struct input_dev *input) enable_irq(keypad_data->irq); +out: pm_runtime_mark_last_busy(dev); pm_runtime_put_autosuspend(dev); - return 0; + return error; } static void omap4_keypad_stop(struct omap4_keypad *keypad_data) @@ -258,6 +265,7 @@ static void omap4_keypad_close(struct input_dev *input) disable_irq(keypad_data->irq); omap4_keypad_stop(keypad_data); enable_irq(keypad_data->irq); + clk_disable_unprepare(keypad_data->fck); pm_runtime_mark_last_busy(dev); pm_runtime_put_autosuspend(dev); @@ -356,6 +364,11 @@ static int omap4_keypad_probe(struct platform_device *pdev) } keypad_data->irq = irq; + keypad_data->fck = devm_clk_get(&pdev->dev, "fck"); + if (IS_ERR(keypad_data->fck)) + return dev_err_probe(&pdev->dev, PTR_ERR(keypad_data->fck), + "unable to get fck"); + mutex_init(&keypad_data->lock); platform_set_drvdata(pdev, keypad_data); -- GitLab From 97a7d8950f676862aa5d26c50cbcdeb6304f31be Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 10 Dec 2023 18:20:57 +0100 Subject: [PATCH 0284/1290] Input: xpad - remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/a3e30e30f18cc5d6f032c8013ce9d900c8e223e5.1702228806.git.christophe.jaillet@wanadoo.fr Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index ede380551e55c..1a58629c12b9a 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -1665,7 +1665,7 @@ static int xpad_led_probe(struct usb_xpad *xpad) if (!led) return -ENOMEM; - xpad->pad_nr = ida_simple_get(&xpad_pad_seq, 0, 0, GFP_KERNEL); + xpad->pad_nr = ida_alloc(&xpad_pad_seq, GFP_KERNEL); if (xpad->pad_nr < 0) { error = xpad->pad_nr; goto err_free_mem; @@ -1688,7 +1688,7 @@ static int xpad_led_probe(struct usb_xpad *xpad) return 0; err_free_id: - ida_simple_remove(&xpad_pad_seq, xpad->pad_nr); + ida_free(&xpad_pad_seq, xpad->pad_nr); err_free_mem: kfree(led); xpad->led = NULL; @@ -1701,7 +1701,7 @@ static void xpad_led_disconnect(struct usb_xpad *xpad) if (xpad_led) { led_classdev_unregister(&xpad_led->led_cdev); - ida_simple_remove(&xpad_pad_seq, xpad->pad_nr); + ida_free(&xpad_pad_seq, xpad->pad_nr); kfree(xpad_led); } } -- GitLab From 8f23f5dba6b4693448144bde4dd6f537543442c2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Oct 2023 08:05:20 +0800 Subject: [PATCH 0285/1290] iommu: Change kconfig around IOMMU_SVA Linus suggested that the kconfig here is confusing: https://lore.kernel.org/all/CAHk-=wgUiAtiszwseM1p2fCJ+sC4XWQ+YN4TanFhUgvUqjr9Xw@mail.gmail.com/ Let's break it into three kconfigs controlling distinct things: - CONFIG_IOMMU_MM_DATA controls if the mm_struct has the additional fields for the IOMMU. Currently only PASID, but later patches store a struct iommu_mm_data * - CONFIG_ARCH_HAS_CPU_PASID controls if the arch needs the scheduling bit for keeping track of the ENQCMD instruction. x86 will select this if IOMMU_SVA is enabled - IOMMU_SVA controls if the IOMMU core compiles in the SVA support code for iommu driver use and the IOMMU exported API This way ARM will not enable CONFIG_ARCH_HAS_CPU_PASID Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-2-tina.zhang@intel.com Signed-off-by: Joerg Roedel --- arch/Kconfig | 5 +++++ arch/x86/Kconfig | 1 + arch/x86/kernel/traps.c | 2 +- drivers/iommu/Kconfig | 1 + include/linux/iommu.h | 2 +- include/linux/mm_types.h | 2 +- include/linux/sched.h | 2 +- kernel/fork.c | 2 +- mm/Kconfig | 3 +++ mm/init-mm.c | 2 +- 10 files changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index f4b210ab06129..3e49f862670e3 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -301,6 +301,11 @@ config ARCH_HAS_DMA_CLEAR_UNCACHED config ARCH_HAS_CPU_FINALIZE_INIT bool +# The architecture has a per-task state that includes the mm's PASID +config ARCH_HAS_CPU_PASID + bool + select IOMMU_MM_DATA + # Select if arch init_task must go in the __init_task_data section config ARCH_TASK_STRUCT_ON_STACK bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3762f41bb0929..68a2ec36a46ec 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -71,6 +71,7 @@ config X86 select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION select ARCH_HAS_CPU_FINALIZE_INIT + select ARCH_HAS_CPU_PASID if IOMMU_SVA select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c876f1d36a81a..2b62dbb3396ad 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -565,7 +565,7 @@ static bool fixup_iopl_exception(struct pt_regs *regs) */ static bool try_fixup_enqcmd_gp(void) { -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID u32 pasid; /* diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 7673bb82945b6..9a29d742617e3 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -160,6 +160,7 @@ config IOMMU_DMA # Shared Virtual Addressing config IOMMU_SVA + select IOMMU_MM_DATA bool config FSL_PAMU diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c7394b39599c8..cd3f398095bf3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1337,7 +1337,7 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream return false; } -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_IOMMU_MM_DATA static inline void mm_pasid_init(struct mm_struct *mm) { mm->pasid = IOMMU_PASID_INVALID; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2a..41f248608dd98 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -938,7 +938,7 @@ struct mm_struct { #endif struct work_struct async_put_work; -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_IOMMU_MM_DATA u32 pasid; #endif #ifdef CONFIG_KSM diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c316972485..70888a36677b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -954,7 +954,7 @@ struct task_struct { /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif #ifdef CONFIG_CPU_SUP_INTEL diff --git a/kernel/fork.c b/kernel/fork.c index 10917c3e1f036..43fd9bc1a5224 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1179,7 +1179,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID tsk->pasid_activated = 0; #endif diff --git a/mm/Kconfig b/mm/Kconfig index 89971a894b605..0143f4d905c96 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1270,6 +1270,9 @@ config LOCK_MM_AND_FIND_VMA bool depends on !STACK_GROWSUP +config IOMMU_MM_DATA + bool + source "mm/damon/Kconfig" endmenu diff --git a/mm/init-mm.c b/mm/init-mm.c index cfd367822cdd2..c52dc2740a3de 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -44,7 +44,7 @@ struct mm_struct init_mm = { #endif .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_IOMMU_MM_DATA .pasid = IOMMU_PASID_INVALID, #endif INIT_MM_CONTEXT(init_mm) -- GitLab From 5c79705d7ce8986036c73e89024d189d4f9df1ff Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:21 +0800 Subject: [PATCH 0286/1290] iommu/vt-d: Remove mm->pasid in intel_sva_bind_mm() The pasid is passed in as a parameter through .set_dev_pasid() callback. Thus, intel_sva_bind_mm() can directly use it instead of retrieving the pasid value from mm->pasid. Suggested-by: Lu Baolu Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-3-tina.zhang@intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/svm.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index ac12f76c1212a..4dbfa970dab53 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -316,21 +316,22 @@ out: } static int intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, - struct mm_struct *mm) + struct iommu_domain *domain, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); + struct mm_struct *mm = domain->mm; struct intel_svm_dev *sdev; struct intel_svm *svm; unsigned long sflags; int ret = 0; - svm = pasid_private_find(mm->pasid); + svm = pasid_private_find(pasid); if (!svm) { svm = kzalloc(sizeof(*svm), GFP_KERNEL); if (!svm) return -ENOMEM; - svm->pasid = mm->pasid; + svm->pasid = pasid; svm->mm = mm; INIT_LIST_HEAD_RCU(&svm->devs); @@ -368,7 +369,7 @@ static int intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; - ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid, + ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid, FLPT_DEFAULT_DID, sflags); if (ret) goto free_sdev; @@ -382,7 +383,7 @@ free_sdev: free_svm: if (list_empty(&svm->devs)) { mmu_notifier_unregister(&svm->notifier, mm); - pasid_private_remove(mm->pasid); + pasid_private_remove(pasid); kfree(svm); } @@ -822,9 +823,8 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; - struct mm_struct *mm = domain->mm; - return intel_svm_bind_mm(iommu, dev, mm); + return intel_svm_bind_mm(iommu, dev, domain, pasid); } static void intel_svm_domain_free(struct iommu_domain *domain) -- GitLab From 2396046d75d3c0b2cfead852a77efd023f8539dc Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:22 +0800 Subject: [PATCH 0287/1290] iommu: Add mm_get_enqcmd_pasid() helper function mm_get_enqcmd_pasid() should be used by architecture code and closely related to learn the PASID value that the x86 ENQCMD operation should use for the mm. For the moment SMMUv3 uses this without any connection to ENQCMD, it will be cleaned up similar to how the prior patch made VT-d use the PASID argument of set_dev_pasid(). The motivation is to replace mm->pasid with an iommu private data structure that is introduced in a later patch. Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Tested-by: Nicolin Chen Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-4-tina.zhang@intel.com Signed-off-by: Joerg Roedel --- arch/x86/kernel/traps.c | 2 +- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 23 ++++++++++++------- drivers/iommu/iommu-sva.c | 2 +- include/linux/iommu.h | 12 ++++++++++ 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 2b62dbb3396ad..5944d759afe7e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -591,7 +591,7 @@ static bool try_fixup_enqcmd_gp(void) if (!mm_valid_pasid(current->mm)) return false; - pasid = current->mm->pasid; + pasid = mm_get_enqcmd_pasid(current->mm); /* * Did this thread already have its PASID activated? diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 353248ab18e76..05722121f00e7 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -246,7 +246,8 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, smmu_domain); } - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, start, size); + arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), start, + size); } static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) @@ -264,10 +265,11 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) * DMA may still be running. Keep the cd valid to avoid C_BAD_CD events, * but disable translation. */ - arm_smmu_update_ctx_desc_devices(smmu_domain, mm->pasid, &quiet_cd); + arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm), + &quiet_cd); arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0); + arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, 0); smmu_mn->cleared = true; mutex_unlock(&sva_lock); @@ -325,10 +327,13 @@ arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_for_each_entry(master, &smmu_domain->devices, domain_head) { - ret = arm_smmu_write_ctx_desc(master, mm->pasid, cd); + ret = arm_smmu_write_ctx_desc(master, mm_get_enqcmd_pasid(mm), + cd); if (ret) { - list_for_each_entry_from_reverse(master, &smmu_domain->devices, domain_head) - arm_smmu_write_ctx_desc(master, mm->pasid, NULL); + list_for_each_entry_from_reverse( + master, &smmu_domain->devices, domain_head) + arm_smmu_write_ctx_desc( + master, mm_get_enqcmd_pasid(mm), NULL); break; } } @@ -358,7 +363,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) list_del(&smmu_mn->list); - arm_smmu_update_ctx_desc_devices(smmu_domain, mm->pasid, NULL); + arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm), + NULL); /* * If we went through clear(), we've already invalidated, and no @@ -366,7 +372,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) */ if (!smmu_mn->cleared) { arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, 0, 0); + arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, + 0); } /* Frees smmu_mn */ diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index b78671a8a9143..4a2f5699747f1 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -141,7 +141,7 @@ u32 iommu_sva_get_pasid(struct iommu_sva *handle) { struct iommu_domain *domain = handle->domain; - return domain->mm->pasid; + return mm_get_enqcmd_pasid(domain->mm); } EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index cd3f398095bf3..4fb239c6ca8d5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1346,6 +1346,12 @@ static inline bool mm_valid_pasid(struct mm_struct *mm) { return mm->pasid != IOMMU_PASID_INVALID; } + +static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) +{ + return mm->pasid; +} + void mm_pasid_drop(struct mm_struct *mm); struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); @@ -1368,6 +1374,12 @@ static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) } static inline void mm_pasid_init(struct mm_struct *mm) {} static inline bool mm_valid_pasid(struct mm_struct *mm) { return false; } + +static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) +{ + return IOMMU_PASID_INVALID; +} + static inline void mm_pasid_drop(struct mm_struct *mm) {} #endif /* CONFIG_IOMMU_SVA */ -- GitLab From 541a3e257d48c16b77d19f39ed939ef5832046df Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:23 +0800 Subject: [PATCH 0288/1290] mm: Add structure to keep sva information Introduce iommu_mm_data structure to keep sva information (pasid and the related sva domains). Add iommu_mm pointer, pointing to an instance of iommu_mm_data structure, to mm. Reviewed-by: Vasant Hegde Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Tested-by: Nicolin Chen Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-5-tina.zhang@intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 5 +++++ include/linux/mm_types.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4fb239c6ca8d5..f7b1b469e98d6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -812,6 +812,11 @@ struct iommu_sva { struct iommu_domain *domain; }; +struct iommu_mm_data { + u32 pasid; + struct list_head sva_domains; +}; + int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, const struct iommu_ops *ops); void iommu_fwspec_free(struct device *dev); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 41f248608dd98..0b4314fab4787 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -727,6 +727,7 @@ struct mm_cid { #endif struct kioctx_table; +struct iommu_mm_data; struct mm_struct { struct { /* @@ -940,6 +941,7 @@ struct mm_struct { #ifdef CONFIG_IOMMU_MM_DATA u32 pasid; + struct iommu_mm_data *iommu_mm; #endif #ifdef CONFIG_KSM /* -- GitLab From 092edaddb660376648acb97678570ed5d8299768 Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:24 +0800 Subject: [PATCH 0289/1290] iommu: Support mm PASID 1:n with sva domains Each mm bound to devices gets a PASID and corresponding sva domains allocated in iommu_sva_bind_device(), which are referenced by iommu_mm field of the mm. The PASID is released in __mmdrop(), while a sva domain is released when no one is using it (the reference count is decremented in iommu_sva_unbind_device()). However, although sva domains and their PASID are separate objects such that their own life cycles could be handled independently, an enqcmd use case may require releasing the PASID in releasing the mm (i.e., once a PASID is allocated for a mm, it will be permanently used by the mm and won't be released until the end of mm) and only allows to drop the PASID after the sva domains are released. To this end, mmgrab() is called in iommu_sva_domain_alloc() to increment the mm reference count and mmdrop() is invoked in iommu_domain_free() to decrement the mm reference count. Since the required info of PASID and sva domains is kept in struct iommu_mm_data of a mm, use mm->iommu_mm field instead of the old pasid field in mm struct. The sva domain list is protected by iommu_sva_lock. Besides, this patch removes mm_pasid_init(), as with the introduced iommu_mm structure, initializing mm pasid in mm_init() is unnecessary. Reviewed-by: Lu Baolu Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Tested-by: Nicolin Chen Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-6-tina.zhang@intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu-sva.c | 92 +++++++++++++++++++++++---------------- include/linux/iommu.h | 23 ++++++++-- 2 files changed, 74 insertions(+), 41 deletions(-) diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 4a2f5699747f1..5175e8d85247b 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -12,32 +12,42 @@ static DEFINE_MUTEX(iommu_sva_lock); /* Allocate a PASID for the mm within range (inclusive) */ -static int iommu_sva_alloc_pasid(struct mm_struct *mm, struct device *dev) +static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct device *dev) { + struct iommu_mm_data *iommu_mm; ioasid_t pasid; - int ret = 0; + + lockdep_assert_held(&iommu_sva_lock); if (!arch_pgtable_dma_compat(mm)) - return -EBUSY; + return ERR_PTR(-EBUSY); - mutex_lock(&iommu_sva_lock); + iommu_mm = mm->iommu_mm; /* Is a PASID already associated with this mm? */ - if (mm_valid_pasid(mm)) { - if (mm->pasid >= dev->iommu->max_pasids) - ret = -EOVERFLOW; - goto out; + if (iommu_mm) { + if (iommu_mm->pasid >= dev->iommu->max_pasids) + return ERR_PTR(-EOVERFLOW); + return iommu_mm; } + iommu_mm = kzalloc(sizeof(struct iommu_mm_data), GFP_KERNEL); + if (!iommu_mm) + return ERR_PTR(-ENOMEM); + pasid = iommu_alloc_global_pasid(dev); if (pasid == IOMMU_PASID_INVALID) { - ret = -ENOSPC; - goto out; + kfree(iommu_mm); + return ERR_PTR(-ENOSPC); } - mm->pasid = pasid; - ret = 0; -out: - mutex_unlock(&iommu_sva_lock); - return ret; + iommu_mm->pasid = pasid; + INIT_LIST_HEAD(&iommu_mm->sva_domains); + /* + * Make sure the write to mm->iommu_mm is not reordered in front of + * initialization to iommu_mm fields. If it does, readers may see a + * valid iommu_mm with uninitialized values. + */ + smp_store_release(&mm->iommu_mm, iommu_mm); + return iommu_mm; } /** @@ -58,31 +68,33 @@ out: */ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) { + struct iommu_mm_data *iommu_mm; struct iommu_domain *domain; struct iommu_sva *handle; int ret; + mutex_lock(&iommu_sva_lock); + /* Allocate mm->pasid if necessary. */ - ret = iommu_sva_alloc_pasid(mm, dev); - if (ret) - return ERR_PTR(ret); + iommu_mm = iommu_alloc_mm_data(mm, dev); + if (IS_ERR(iommu_mm)) { + ret = PTR_ERR(iommu_mm); + goto out_unlock; + } handle = kzalloc(sizeof(*handle), GFP_KERNEL); - if (!handle) - return ERR_PTR(-ENOMEM); - - mutex_lock(&iommu_sva_lock); - /* Search for an existing domain. */ - domain = iommu_get_domain_for_dev_pasid(dev, mm->pasid, - IOMMU_DOMAIN_SVA); - if (IS_ERR(domain)) { - ret = PTR_ERR(domain); + if (!handle) { + ret = -ENOMEM; goto out_unlock; } - if (domain) { - domain->users++; - goto out; + /* Search for an existing domain. */ + list_for_each_entry(domain, &mm->iommu_mm->sva_domains, next) { + ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid); + if (!ret) { + domain->users++; + goto out; + } } /* Allocate a new domain and set it on device pasid. */ @@ -92,23 +104,23 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm goto out_unlock; } - ret = iommu_attach_device_pasid(domain, dev, mm->pasid); + ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid); if (ret) goto out_free_domain; domain->users = 1; + list_add(&domain->next, &mm->iommu_mm->sva_domains); + out: mutex_unlock(&iommu_sva_lock); handle->dev = dev; handle->domain = domain; - return handle; out_free_domain: iommu_domain_free(domain); + kfree(handle); out_unlock: mutex_unlock(&iommu_sva_lock); - kfree(handle); - return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(iommu_sva_bind_device); @@ -124,12 +136,13 @@ EXPORT_SYMBOL_GPL(iommu_sva_bind_device); void iommu_sva_unbind_device(struct iommu_sva *handle) { struct iommu_domain *domain = handle->domain; - ioasid_t pasid = domain->mm->pasid; + struct iommu_mm_data *iommu_mm = domain->mm->iommu_mm; struct device *dev = handle->dev; mutex_lock(&iommu_sva_lock); + iommu_detach_device_pasid(domain, dev, iommu_mm->pasid); if (--domain->users == 0) { - iommu_detach_device_pasid(domain, dev, pasid); + list_del(&domain->next); iommu_domain_free(domain); } mutex_unlock(&iommu_sva_lock); @@ -205,8 +218,11 @@ out_put_mm: void mm_pasid_drop(struct mm_struct *mm) { - if (likely(!mm_valid_pasid(mm))) + struct iommu_mm_data *iommu_mm = mm->iommu_mm; + + if (!iommu_mm) return; - iommu_free_global_pasid(mm->pasid); + iommu_free_global_pasid(iommu_mm->pasid); + kfree(iommu_mm); } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f7b1b469e98d6..c6bbbe0901d0c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -121,6 +121,11 @@ struct iommu_domain { struct { /* IOMMU_DOMAIN_SVA */ struct mm_struct *mm; int users; + /* + * Next iommu_domain in mm->iommu_mm->sva-domains list + * protected by iommu_sva_lock. + */ + struct list_head next; }; }; }; @@ -1345,16 +1350,28 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream #ifdef CONFIG_IOMMU_MM_DATA static inline void mm_pasid_init(struct mm_struct *mm) { - mm->pasid = IOMMU_PASID_INVALID; + /* + * During dup_mm(), a new mm will be memcpy'd from an old one and that makes + * the new mm and the old one point to a same iommu_mm instance. When either + * one of the two mms gets released, the iommu_mm instance is freed, leaving + * the other mm running into a use-after-free/double-free problem. To avoid + * the problem, zeroing the iommu_mm pointer of a new mm is needed here. + */ + mm->iommu_mm = NULL; } + static inline bool mm_valid_pasid(struct mm_struct *mm) { - return mm->pasid != IOMMU_PASID_INVALID; + return READ_ONCE(mm->iommu_mm); } static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) { - return mm->pasid; + struct iommu_mm_data *iommu_mm = READ_ONCE(mm->iommu_mm); + + if (!iommu_mm) + return IOMMU_PASID_INVALID; + return iommu_mm->pasid; } void mm_pasid_drop(struct mm_struct *mm); -- GitLab From 1fa05c932dc71c474da38e4fd0456131128f8486 Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:25 +0800 Subject: [PATCH 0290/1290] mm: Deprecate pasid field Drop the pasid field, as all the information needed for sva domain management has been moved to the newly added iommu_mm field. Reviewed-by: Lu Baolu Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-7-tina.zhang@intel.com Signed-off-by: Joerg Roedel --- include/linux/mm_types.h | 1 - mm/init-mm.c | 3 --- 2 files changed, 4 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0b4314fab4787..ec71c91e210ba 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -940,7 +940,6 @@ struct mm_struct { struct work_struct async_put_work; #ifdef CONFIG_IOMMU_MM_DATA - u32 pasid; struct iommu_mm_data *iommu_mm; #endif #ifdef CONFIG_KSM diff --git a/mm/init-mm.c b/mm/init-mm.c index c52dc2740a3de..24c8093792745 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -44,9 +44,6 @@ struct mm_struct init_mm = { #endif .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, -#ifdef CONFIG_IOMMU_MM_DATA - .pasid = IOMMU_PASID_INVALID, -#endif INIT_MM_CONTEXT(init_mm) }; -- GitLab From 7be423336eccc872249d37900c19c1d24f171353 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 8 Dec 2023 09:53:14 +0800 Subject: [PATCH 0291/1290] iommu: Set owner token to SVA domain Commit a9c362db3920 ("iommu: Validate that devices match domains") added an owner token to the iommu_domain. This token is checked during domain attachment to RID or PASID through the generic iommu interfaces. The SVA domains are attached to PASIDs through those iommu interfaces. Therefore, they require the owner token to be set during allocation. Otherwise, they fail to attach. Set the owner token for SVA domains. Fixes: a9c362db3920 ("iommu: Validate that devices match domains") Cc: Robin Murphy Signed-off-by: Lu Baolu Reviewed-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231208015314.320663-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0d25468d53a68..d0a28667479a8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3617,6 +3617,7 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, domain->type = IOMMU_DOMAIN_SVA; mmgrab(mm); domain->mm = mm; + domain->owner = ops; domain->iopf_handler = iommu_sva_handle_iopf; domain->fault_data = mm; -- GitLab From 4720287c7bf76e59d19d4dfbdc3f54eeea6fd46b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:08 -0400 Subject: [PATCH 0292/1290] iommu: Remove struct iommu_ops *iommu from arch_setup_dma_ops() This is not being used to pass ops, it is just a way to tell if an iommu driver was probed. These days this can be detected directly via device_iommu_mapped(). Call device_iommu_mapped() in the two places that need to check it and remove the iommu parameter everywhere. Reviewed-by: Jerry Snitselaar Reviewed-by: Lu Baolu Reviewed-by: Moritz Fischer Acked-by: Christoph Hellwig Acked-by: Rob Herring Tested-by: Hector Martin Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel --- arch/arc/mm/dma.c | 2 +- arch/arm/mm/dma-mapping-nommu.c | 2 +- arch/arm/mm/dma-mapping.c | 10 +++++----- arch/arm64/mm/dma-mapping.c | 4 ++-- arch/mips/mm/dma-noncoherent.c | 2 +- arch/riscv/mm/dma-noncoherent.c | 2 +- drivers/acpi/scan.c | 3 +-- drivers/hv/hv_common.c | 2 +- drivers/of/device.c | 2 +- include/linux/dma-map-ops.h | 4 ++-- 10 files changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 2a7fbbb83b705..197707bc76588 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -91,7 +91,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, * Plug in direct dma map ops. */ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { /* * IOC hardware snoops all DMA traffic keeping the caches consistent diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c index cfd9c933d2f09..b94850b579952 100644 --- a/arch/arm/mm/dma-mapping-nommu.c +++ b/arch/arm/mm/dma-mapping-nommu.c @@ -34,7 +34,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, } void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { if (IS_ENABLED(CONFIG_CPU_V7M)) { /* diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 5409225b4abc0..6c359a3af8d9c 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -1713,7 +1713,7 @@ void arm_iommu_detach_device(struct device *dev) EXPORT_SYMBOL_GPL(arm_iommu_detach_device); static void arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { struct dma_iommu_mapping *mapping; @@ -1748,7 +1748,7 @@ static void arm_teardown_iommu_dma_ops(struct device *dev) #else static void arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { } @@ -1757,7 +1757,7 @@ static void arm_teardown_iommu_dma_ops(struct device *dev) { } #endif /* CONFIG_ARM_DMA_USE_IOMMU */ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { /* * Due to legacy code that sets the ->dma_coherent flag from a bus @@ -1776,8 +1776,8 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, if (dev->dma_ops) return; - if (iommu) - arm_setup_iommu_dma_ops(dev, dma_base, size, iommu, coherent); + if (device_iommu_mapped(dev)) + arm_setup_iommu_dma_ops(dev, dma_base, size, coherent); xen_setup_dma_ops(dev); dev->archdata.dma_ops_setup = true; diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 3cb101e8cb29b..61886e43e3a10 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -47,7 +47,7 @@ void arch_teardown_dma_ops(struct device *dev) #endif void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { int cls = cache_line_size_of_cpu(); @@ -58,7 +58,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, ARCH_DMA_MINALIGN, cls); dev->dma_coherent = coherent; - if (iommu) + if (device_iommu_mapped(dev)) iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1); xen_setup_dma_ops(dev); diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c index 3c4fc97b9f394..0f3cec663a12c 100644 --- a/arch/mips/mm/dma-noncoherent.c +++ b/arch/mips/mm/dma-noncoherent.c @@ -138,7 +138,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { dev->dma_coherent = coherent; } diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c index 4e4e469b8dd66..843107f834b23 100644 --- a/arch/riscv/mm/dma-noncoherent.c +++ b/arch/riscv/mm/dma-noncoherent.c @@ -129,7 +129,7 @@ void arch_dma_prep_coherent(struct page *page, size_t size) } void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent) + bool coherent) { WARN_TAINT(!coherent && riscv_cbom_block_size > ARCH_DMA_MINALIGN, TAINT_CPU_OUT_OF_SPEC, diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 02bb2cce423f4..444a0b3c72f2d 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1641,8 +1641,7 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, if (PTR_ERR(iommu) == -EPROBE_DEFER) return -EPROBE_DEFER; - arch_setup_dma_ops(dev, 0, U64_MAX, - iommu, attr == DEV_DMA_COHERENT); + arch_setup_dma_ops(dev, 0, U64_MAX, attr == DEV_DMA_COHERENT); return 0; } diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 4372f5d146ab2..0285a74363b3d 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -488,7 +488,7 @@ void hv_setup_dma_ops(struct device *dev, bool coherent) * Hyper-V does not offer a vIOMMU in the guest * VM, so pass 0/NULL for the IOMMU settings */ - arch_setup_dma_ops(dev, 0, 0, NULL, coherent); + arch_setup_dma_ops(dev, 0, 0, coherent); } EXPORT_SYMBOL_GPL(hv_setup_dma_ops); diff --git a/drivers/of/device.c b/drivers/of/device.c index 1ca42ad9dd159..65c71be71a8d4 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -193,7 +193,7 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, dev_dbg(dev, "device is%sbehind an iommu\n", iommu ? " " : " not "); - arch_setup_dma_ops(dev, dma_start, size, iommu, coherent); + arch_setup_dma_ops(dev, dma_start, size, coherent); if (!iommu) of_dma_set_restricted_buffer(dev, np); diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index a52e508d1869f..e9cc317e9d7de 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -427,10 +427,10 @@ bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg, #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent); + bool coherent); #else static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base, - u64 size, const struct iommu_ops *iommu, bool coherent) + u64 size, bool coherent) { } #endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */ -- GitLab From 6ff6e184f1f4d4993d45ca3f934c8288890965fe Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:09 -0400 Subject: [PATCH 0293/1290] iommmu/of: Do not return struct iommu_ops from of_iommu_configure() Nothing needs this pointer. Return a normal error code with the usual IOMMU semantic that ENODEV means 'there is no IOMMU driver'. Reviewed-by: Jerry Snitselaar Reviewed-by: Lu Baolu Acked-by: Rob Herring Tested-by: Hector Martin Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/of_iommu.c | 31 +++++++++++++++++++------------ drivers/of/device.c | 22 +++++++++++++++------- include/linux/of_iommu.h | 13 ++++++------- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 5ecca53847d32..c6510d7e7b241 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -107,16 +107,22 @@ static int of_iommu_configure_device(struct device_node *master_np, of_iommu_configure_dev(master_np, dev); } -const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np, - const u32 *id) +/* + * Returns: + * 0 on success, an iommu was configured + * -ENODEV if the device does not have any IOMMU + * -EPROBEDEFER if probing should be tried again + * -errno fatal errors + */ +int of_iommu_configure(struct device *dev, struct device_node *master_np, + const u32 *id) { const struct iommu_ops *ops = NULL; struct iommu_fwspec *fwspec; int err = NO_IOMMU; if (!master_np) - return NULL; + return -ENODEV; /* Serialise to make dev->iommu stable under our potential fwspec */ mutex_lock(&iommu_probe_device_lock); @@ -124,7 +130,7 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, if (fwspec) { if (fwspec->ops) { mutex_unlock(&iommu_probe_device_lock); - return fwspec->ops; + return 0; } /* In the deferred case, start again from scratch */ iommu_fwspec_free(dev); @@ -169,14 +175,15 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, err = iommu_probe_device(dev); /* Ignore all other errors apart from EPROBE_DEFER */ - if (err == -EPROBE_DEFER) { - ops = ERR_PTR(err); - } else if (err < 0) { - dev_dbg(dev, "Adding to IOMMU failed: %d\n", err); - ops = NULL; + if (err < 0) { + if (err == -EPROBE_DEFER) + return err; + dev_dbg(dev, "Adding to IOMMU failed: %pe\n", ERR_PTR(err)); + return err; } - - return ops; + if (!ops) + return -ENODEV; + return 0; } static enum iommu_resv_type __maybe_unused diff --git a/drivers/of/device.c b/drivers/of/device.c index 65c71be71a8d4..873d933e8e6d1 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -93,12 +93,12 @@ of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) int of_dma_configure_id(struct device *dev, struct device_node *np, bool force_dma, const u32 *id) { - const struct iommu_ops *iommu; const struct bus_dma_region *map = NULL; struct device_node *bus_np; u64 dma_start = 0; u64 mask, end, size = 0; bool coherent; + int iommu_ret; int ret; if (np == dev->of_node) @@ -181,21 +181,29 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, dev_dbg(dev, "device is%sdma coherent\n", coherent ? " " : " not "); - iommu = of_iommu_configure(dev, np, id); - if (PTR_ERR(iommu) == -EPROBE_DEFER) { + iommu_ret = of_iommu_configure(dev, np, id); + if (iommu_ret == -EPROBE_DEFER) { /* Don't touch range map if it wasn't set from a valid dma-ranges */ if (!ret) dev->dma_range_map = NULL; kfree(map); return -EPROBE_DEFER; - } + } else if (iommu_ret == -ENODEV) { + dev_dbg(dev, "device is not behind an iommu\n"); + } else if (iommu_ret) { + dev_err(dev, "iommu configuration for device failed with %pe\n", + ERR_PTR(iommu_ret)); - dev_dbg(dev, "device is%sbehind an iommu\n", - iommu ? " " : " not "); + /* + * Historically this routine doesn't fail driver probing + * due to errors in of_iommu_configure() + */ + } else + dev_dbg(dev, "device is behind an iommu\n"); arch_setup_dma_ops(dev, dma_start, size, coherent); - if (!iommu) + if (iommu_ret) of_dma_set_restricted_buffer(dev, np); return 0; diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h index 9a5e6b410dd2f..e61cbbe12dac6 100644 --- a/include/linux/of_iommu.h +++ b/include/linux/of_iommu.h @@ -8,20 +8,19 @@ struct iommu_ops; #ifdef CONFIG_OF_IOMMU -extern const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np, - const u32 *id); +extern int of_iommu_configure(struct device *dev, struct device_node *master_np, + const u32 *id); extern void of_iommu_get_resv_regions(struct device *dev, struct list_head *list); #else -static inline const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np, - const u32 *id) +static inline int of_iommu_configure(struct device *dev, + struct device_node *master_np, + const u32 *id) { - return NULL; + return -ENODEV; } static inline void of_iommu_get_resv_regions(struct device *dev, -- GitLab From 5b4ea8b06eb79234a244ffc1f7405aa968f62069 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:10 -0400 Subject: [PATCH 0294/1290] iommu/of: Use -ENODEV consistently in of_iommu_configure() Instead of returning 1 and trying to handle positive error codes just stick to the convention of returning -ENODEV. Remove references to ops from of_iommu_configure(), a NULL ops will already generate an error code. There is no reason to check dev->bus, if err=0 at this point then the called configure functions thought there was an iommu and we should try to probe it. Remove it. Reviewed-by: Jerry Snitselaar Reviewed-by: Moritz Fischer Tested-by: Hector Martin Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/of_iommu.c | 49 ++++++++++++---------------------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index c6510d7e7b241..164317bfb8a81 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -17,8 +17,6 @@ #include #include -#define NO_IOMMU 1 - static int of_iommu_xlate(struct device *dev, struct of_phandle_args *iommu_spec) { @@ -29,7 +27,7 @@ static int of_iommu_xlate(struct device *dev, ops = iommu_ops_from_fwnode(fwnode); if ((ops && !ops->of_xlate) || !of_device_is_available(iommu_spec->np)) - return NO_IOMMU; + return -ENODEV; ret = iommu_fwspec_init(dev, &iommu_spec->np->fwnode, ops); if (ret) @@ -61,7 +59,7 @@ static int of_iommu_configure_dev_id(struct device_node *master_np, "iommu-map-mask", &iommu_spec.np, iommu_spec.args); if (err) - return err == -ENODEV ? NO_IOMMU : err; + return err; err = of_iommu_xlate(dev, &iommu_spec); of_node_put(iommu_spec.np); @@ -72,7 +70,7 @@ static int of_iommu_configure_dev(struct device_node *master_np, struct device *dev) { struct of_phandle_args iommu_spec; - int err = NO_IOMMU, idx = 0; + int err = -ENODEV, idx = 0; while (!of_parse_phandle_with_args(master_np, "iommus", "#iommu-cells", @@ -117,9 +115,8 @@ static int of_iommu_configure_device(struct device_node *master_np, int of_iommu_configure(struct device *dev, struct device_node *master_np, const u32 *id) { - const struct iommu_ops *ops = NULL; struct iommu_fwspec *fwspec; - int err = NO_IOMMU; + int err; if (!master_np) return -ENODEV; @@ -153,37 +150,21 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, } else { err = of_iommu_configure_device(master_np, dev, id); } - - /* - * Two success conditions can be represented by non-negative err here: - * >0 : there is no IOMMU, or one was unavailable for non-fatal reasons - * 0 : we found an IOMMU, and dev->fwspec is initialised appropriately - * <0 : any actual error - */ - if (!err) { - /* The fwspec pointer changed, read it again */ - fwspec = dev_iommu_fwspec_get(dev); - ops = fwspec->ops; - } mutex_unlock(&iommu_probe_device_lock); - /* - * If we have reason to believe the IOMMU driver missed the initial - * probe for dev, replay it to get things in order. - */ - if (!err && dev->bus) - err = iommu_probe_device(dev); - - /* Ignore all other errors apart from EPROBE_DEFER */ - if (err < 0) { - if (err == -EPROBE_DEFER) - return err; - dev_dbg(dev, "Adding to IOMMU failed: %pe\n", ERR_PTR(err)); + if (err == -ENODEV || err == -EPROBE_DEFER) return err; - } - if (!ops) - return -ENODEV; + if (err) + goto err_log; + + err = iommu_probe_device(dev); + if (err) + goto err_log; return 0; + +err_log: + dev_dbg(dev, "Adding to IOMMU failed: %pe\n", ERR_PTR(err)); + return err; } static enum iommu_resv_type __maybe_unused -- GitLab From 64945d1b0ed169aeffa59020941e4ac45ebc315a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:11 -0400 Subject: [PATCH 0295/1290] iommu: Mark dev_iommu_get() with lockdep Allocation of dev->iommu must be done under the iommu_probe_device_lock. Mark this with lockdep to discourage future mistakes. Reviewed-by: Jerry Snitselaar Tested-by: Hector Martin Reviewed-by: Lu Baolu Reviewed-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d0a28667479a8..df58025c001b3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -334,6 +334,8 @@ static struct dev_iommu *dev_iommu_get(struct device *dev) { struct dev_iommu *param = dev->iommu; + lockdep_assert_held(&iommu_probe_device_lock); + if (param) return param; -- GitLab From eda1a94caf6b05482bbf57dc244e7a31a9dba77c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:12 -0400 Subject: [PATCH 0296/1290] iommu: Mark dev_iommu_priv_set() with a lockdep A perfect driver would only call dev_iommu_priv_set() from its probe callback. We've made it functionally correct to call it from the of_xlate by adding a lock around that call. lockdep assert that iommu_probe_device_lock is held to discourage misuse. Exclude PPC kernels with CONFIG_FSL_PAMU turned on because FSL_PAMU uses a global static for its priv and abuses priv for its domain. Remove the pointless stores of NULL, all these are on paths where the core code will free dev->iommu after the op returns. Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Tested-by: Hector Martin Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 2 -- drivers/iommu/apple-dart.c | 1 - drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 - drivers/iommu/arm/arm-smmu/arm-smmu.c | 1 - drivers/iommu/intel/iommu.c | 2 -- drivers/iommu/iommu.c | 9 +++++++++ drivers/iommu/omap-iommu.c | 1 - include/linux/iommu.h | 5 +---- 8 files changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index fcc987f5d4edc..8199c678c2dc2 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -551,8 +551,6 @@ static void amd_iommu_uninit_device(struct device *dev) if (dev_data->domain) detach_device(dev); - dev_iommu_priv_set(dev, NULL); - /* * We keep dev_data around for unplugged devices and reuse it when the * device is re-plugged - not doing so would introduce a ton of races. diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index ee05f4824bfad..56cfc33042e0b 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -740,7 +740,6 @@ static void apple_dart_release_device(struct device *dev) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); - dev_iommu_priv_set(dev, NULL); kfree(cfg); } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index fc4317c25b6d5..1855d3892b15f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2695,7 +2695,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) err_free_master: kfree(master); - dev_iommu_priv_set(dev, NULL); return ERR_PTR(ret); } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 4d09c00478927..adc7937fd8a3a 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1420,7 +1420,6 @@ static void arm_smmu_release_device(struct device *dev) arm_smmu_rpm_put(cfg->smmu); - dev_iommu_priv_set(dev, NULL); kfree(cfg); } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 897159dba47de..511589341074f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4461,7 +4461,6 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) ret = intel_pasid_alloc_table(dev); if (ret) { dev_err(dev, "PASID table allocation failed\n"); - dev_iommu_priv_set(dev, NULL); kfree(info); return ERR_PTR(ret); } @@ -4479,7 +4478,6 @@ static void intel_iommu_release_device(struct device *dev) dmar_remove_one_dev_info(dev); intel_pasid_free_table(dev); intel_iommu_debugfs_remove_dev(info); - dev_iommu_priv_set(dev, NULL); kfree(info); set_dma_ops(dev, NULL); } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index df58025c001b3..68e648b557670 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -387,6 +387,15 @@ static u32 dev_iommu_get_max_pasids(struct device *dev) return min_t(u32, max_pasids, dev->iommu->iommu_dev->max_pasids); } +void dev_iommu_priv_set(struct device *dev, void *priv) +{ + /* FSL_PAMU does something weird */ + if (!IS_ENABLED(CONFIG_FSL_PAMU)) + lockdep_assert_held(&iommu_probe_device_lock); + dev->iommu->priv = priv; +} +EXPORT_SYMBOL_GPL(dev_iommu_priv_set); + /* * Init the dev->iommu and dev->iommu_group in the struct device and get the * driver probed diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index c66b070841dd4..c9528065a59af 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1719,7 +1719,6 @@ static void omap_iommu_release_device(struct device *dev) if (!dev->of_node || !arch_data) return; - dev_iommu_priv_set(dev, NULL); kfree(arch_data); } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c6bbbe0901d0c..3a556996fea7f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -850,10 +850,7 @@ static inline void *dev_iommu_priv_get(struct device *dev) return NULL; } -static inline void dev_iommu_priv_set(struct device *dev, void *priv) -{ - dev->iommu->priv = priv; -} +void dev_iommu_priv_set(struct device *dev, void *priv); extern struct mutex iommu_probe_device_lock; int iommu_probe_device(struct device *dev); -- GitLab From cdbc723f2da11331bc92858da50dc7a7610ed9bd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:13 -0400 Subject: [PATCH 0297/1290] acpi: Do not return struct iommu_ops from acpi_iommu_configure_id() Nothing needs this pointer. Return a normal error code with the usual IOMMU semantic that ENODEV means 'there is no IOMMU driver'. Acked-by: Rafael J. Wysocki Reviewed-by: Jerry Snitselaar Reviewed-by: Lu Baolu Reviewed-by: Moritz Fischer Tested-by: Hector Martin Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/acpi/scan.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 444a0b3c72f2d..340ba720c7212 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1562,8 +1562,7 @@ static inline const struct iommu_ops *acpi_iommu_fwspec_ops(struct device *dev) return fwspec ? fwspec->ops : NULL; } -static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev, - const u32 *id_in) +static int acpi_iommu_configure_id(struct device *dev, const u32 *id_in) { int err; const struct iommu_ops *ops; @@ -1577,7 +1576,7 @@ static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev, ops = acpi_iommu_fwspec_ops(dev); if (ops) { mutex_unlock(&iommu_probe_device_lock); - return ops; + return 0; } err = iort_iommu_configure_id(dev, id_in); @@ -1594,12 +1593,14 @@ static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev, /* Ignore all other errors apart from EPROBE_DEFER */ if (err == -EPROBE_DEFER) { - return ERR_PTR(err); + return err; } else if (err) { dev_dbg(dev, "Adding to IOMMU failed: %d\n", err); - return NULL; + return -ENODEV; } - return acpi_iommu_fwspec_ops(dev); + if (!acpi_iommu_fwspec_ops(dev)) + return -ENODEV; + return 0; } #else /* !CONFIG_IOMMU_API */ @@ -1611,10 +1612,9 @@ int acpi_iommu_fwspec_init(struct device *dev, u32 id, return -ENODEV; } -static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev, - const u32 *id_in) +static int acpi_iommu_configure_id(struct device *dev, const u32 *id_in) { - return NULL; + return -ENODEV; } #endif /* !CONFIG_IOMMU_API */ @@ -1628,7 +1628,7 @@ static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev, int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, const u32 *input_id) { - const struct iommu_ops *iommu; + int ret; if (attr == DEV_DMA_NOT_SUPPORTED) { set_dma_ops(dev, &dma_dummy_ops); @@ -1637,10 +1637,15 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, acpi_arch_dma_setup(dev); - iommu = acpi_iommu_configure_id(dev, input_id); - if (PTR_ERR(iommu) == -EPROBE_DEFER) + ret = acpi_iommu_configure_id(dev, input_id); + if (ret == -EPROBE_DEFER) return -EPROBE_DEFER; + /* + * Historically this routine doesn't fail driver probing due to errors + * in acpi_iommu_configure_id() + */ + arch_setup_dma_ops(dev, 0, U64_MAX, attr == DEV_DMA_COHERENT); return 0; -- GitLab From bf9cd9fef9f15531680325f956f81317d46a159d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:14 -0400 Subject: [PATCH 0298/1290] iommu/tegra: Use tegra_dev_iommu_get_stream_id() in the remaining places This API was defined to formalize the access to internal iommu details on some Tegra SOCs, but a few callers got missed. Add them. The helper already masks by 0xFFFF so remove this code from the callers. Suggested-by: Thierry Reding Reviewed-by: Thierry Reding Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/dma/tegra186-gpc-dma.c | 8 +++----- drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp10b.c | 9 ++------- drivers/memory/tegra/tegra186.c | 14 ++++++++------ 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/drivers/dma/tegra186-gpc-dma.c b/drivers/dma/tegra186-gpc-dma.c index fa4d4142a68a2..88547a23825b1 100644 --- a/drivers/dma/tegra186-gpc-dma.c +++ b/drivers/dma/tegra186-gpc-dma.c @@ -1348,8 +1348,8 @@ static int tegra_dma_program_sid(struct tegra_dma_channel *tdc, int stream_id) static int tegra_dma_probe(struct platform_device *pdev) { const struct tegra_dma_chip_data *cdata = NULL; - struct iommu_fwspec *iommu_spec; - unsigned int stream_id, i; + unsigned int i; + u32 stream_id; struct tegra_dma *tdma; int ret; @@ -1378,12 +1378,10 @@ static int tegra_dma_probe(struct platform_device *pdev) tdma->dma_dev.dev = &pdev->dev; - iommu_spec = dev_iommu_fwspec_get(&pdev->dev); - if (!iommu_spec) { + if (!tegra_dev_iommu_get_stream_id(&pdev->dev, &stream_id)) { dev_err(&pdev->dev, "Missing iommu stream-id\n"); return -EINVAL; } - stream_id = iommu_spec->ids[0] & 0xffff; ret = device_property_read_u32(&pdev->dev, "dma-channel-mask", &tdma->chan_mask); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp10b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp10b.c index e7e8fdf3adab7..29682722b0b36 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp10b.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp10b.c @@ -28,19 +28,14 @@ static void gp10b_ltc_init(struct nvkm_ltc *ltc) { struct nvkm_device *device = ltc->subdev.device; - struct iommu_fwspec *spec; + u32 sid; nvkm_wr32(device, 0x17e27c, ltc->ltc_nr); nvkm_wr32(device, 0x17e000, ltc->ltc_nr); nvkm_wr32(device, 0x100800, ltc->ltc_nr); - spec = dev_iommu_fwspec_get(device->dev); - if (spec) { - u32 sid = spec->ids[0] & 0xffff; - - /* stream ID */ + if (tegra_dev_iommu_get_stream_id(device->dev, &sid)) nvkm_wr32(device, 0x160000, sid << 2); - } } static const struct nvkm_ltc_func diff --git a/drivers/memory/tegra/tegra186.c b/drivers/memory/tegra/tegra186.c index 533f85a4b2bdb..9cbf22a10a827 100644 --- a/drivers/memory/tegra/tegra186.c +++ b/drivers/memory/tegra/tegra186.c @@ -111,9 +111,12 @@ static void tegra186_mc_client_sid_override(struct tegra_mc *mc, static int tegra186_mc_probe_device(struct tegra_mc *mc, struct device *dev) { #if IS_ENABLED(CONFIG_IOMMU_API) - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct of_phandle_args args; unsigned int i, index = 0; + u32 sid; + + if (!tegra_dev_iommu_get_stream_id(dev, &sid)) + return 0; while (!of_parse_phandle_with_args(dev->of_node, "interconnects", "#interconnect-cells", index, &args)) { @@ -121,11 +124,10 @@ static int tegra186_mc_probe_device(struct tegra_mc *mc, struct device *dev) for (i = 0; i < mc->soc->num_clients; i++) { const struct tegra_mc_client *client = &mc->soc->clients[i]; - if (client->id == args.args[0]) { - u32 sid = fwspec->ids[0] & MC_SID_STREAMID_OVERRIDE_MASK; - - tegra186_mc_client_sid_override(mc, client, sid); - } + if (client->id == args.args[0]) + tegra186_mc_client_sid_override( + mc, client, + sid & MC_SID_STREAMID_OVERRIDE_MASK); } } -- GitLab From 54dae6d5d3f091568322c4a69ddd3459a81545ad Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 12 Nov 2023 19:45:22 +0100 Subject: [PATCH 0299/1290] dt-bindings: iommu: arm,smmu: document clocks for the SM8350 GPU SMMU Document the clocks for Qualcomm SM8350 Adreno GPU SMMU, already used in DTS: sm8350-hdk.dtb: iommu@3da0000: clock-names: False schema does not allow ['bus', 'iface', 'ahb', 'hlos1_vote_gpu_smmu', 'cx_gmu', 'hub_cx_int', 'hub_aon'] Signed-off-by: Krzysztof Kozlowski Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20231112184522.3759-1-krzysztof.kozlowski@linaro.org Signed-off-by: Will Deacon --- .../devicetree/bindings/iommu/arm,smmu.yaml | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml index aa9e1c0895a50..e89f8a5c44e42 100644 --- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml +++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml @@ -429,6 +429,30 @@ allOf: - description: interface clock required to access smmu's registers through the TCU's programming interface. + - if: + properties: + compatible: + items: + - enum: + - qcom,sm8350-smmu-500 + - const: qcom,adreno-smmu + - const: qcom,smmu-500 + - const: arm,mmu-500 + then: + properties: + clock-names: + items: + - const: bus + - const: iface + - const: ahb + - const: hlos1_vote_gpu_smmu + - const: cx_gmu + - const: hub_cx_int + - const: hub_aon + clocks: + minItems: 7 + maxItems: 7 + - if: properties: compatible: @@ -472,7 +496,6 @@ allOf: - qcom,sdx65-smmu-500 - qcom,sm6350-smmu-500 - qcom,sm6375-smmu-500 - - qcom,sm8350-smmu-500 - qcom,sm8450-smmu-500 - qcom,sm8550-smmu-500 then: -- GitLab From 61683b47df44fa7efd4039676436e711f7e31c70 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 28 Nov 2023 09:41:23 +0100 Subject: [PATCH 0300/1290] dt-bindings: iommu: arm,smmu: document the SM8650 System MMU Document the System MMU on the SM8650 Platform. Acked-by: Rob Herring Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20231128-topic-sm8650-upstream-bindings-smmu-v2-1-d3fbcaf1ea92@linaro.org Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/iommu/arm,smmu.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml index e89f8a5c44e42..b4b76437369a3 100644 --- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml +++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml @@ -56,6 +56,7 @@ properties: - qcom,sm8350-smmu-500 - qcom,sm8450-smmu-500 - qcom,sm8550-smmu-500 + - qcom,sm8650-smmu-500 - const: qcom,smmu-500 - const: arm,mmu-500 @@ -498,6 +499,7 @@ allOf: - qcom,sm6375-smmu-500 - qcom,sm8450-smmu-500 - qcom,sm8550-smmu-500 + - qcom,sm8650-smmu-500 then: properties: clock-names: false -- GitLab From fa27b35c9102e3e2b75455517f7091b294cc0552 Mon Sep 17 00:00:00 2001 From: Rajendra Nayak Date: Fri, 24 Nov 2023 15:36:05 +0530 Subject: [PATCH 0301/1290] dt-bindings: arm-smmu: Add compatible for X1E80100 SoC Add the SoC specific compatible for X1E80100 implementing arm,mmu-500. Signed-off-by: Rajendra Nayak Co-developed-by: Sibi Sankar Signed-off-by: Sibi Sankar Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231124100608.29964-3-quic_sibis@quicinc.com Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/iommu/arm,smmu.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml index b4b76437369a3..a5adfba939243 100644 --- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml +++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml @@ -57,6 +57,7 @@ properties: - qcom,sm8450-smmu-500 - qcom,sm8550-smmu-500 - qcom,sm8650-smmu-500 + - qcom,x1e80100-smmu-500 - const: qcom,smmu-500 - const: arm,mmu-500 @@ -500,6 +501,7 @@ allOf: - qcom,sm8450-smmu-500 - qcom,sm8550-smmu-500 - qcom,sm8650-smmu-500 + - qcom,x1e80100-smmu-500 then: properties: clock-names: false -- GitLab From 4fff78dc2490fc0c67669c610dbc42921dd23a1c Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Mon, 4 Dec 2023 13:55:20 +0100 Subject: [PATCH 0302/1290] dt-bindings: arm-smmu: Document SM8[45]50 GPU SMMU SM8450 and SM8550 both use a Qualcomm-modified MMU500 for their GPU. In both cases, it requires a set of clocks to be enabled. Describe that. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20231127-topic-a7xx_dt-v2-1-2a437588e563@linaro.org Signed-off-by: Will Deacon --- .../devicetree/bindings/iommu/arm,smmu.yaml | 48 ++++++++++++++++++- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml index a5adfba939243..a4042ae247702 100644 --- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml +++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml @@ -91,6 +91,8 @@ properties: - qcom,sm8150-smmu-500 - qcom,sm8250-smmu-500 - qcom,sm8350-smmu-500 + - qcom,sm8450-smmu-500 + - qcom,sm8550-smmu-500 - const: qcom,adreno-smmu - const: qcom,smmu-500 - const: arm,mmu-500 @@ -479,6 +481,50 @@ allOf: - description: Voter clock required for HLOS SMMU access - description: Interface clock required for register access + - if: + properties: + compatible: + const: qcom,sm8450-smmu-500 + then: + properties: + clock-names: + items: + - const: gmu + - const: hub + - const: hlos + - const: bus + - const: iface + - const: ahb + + clocks: + items: + - description: GMU clock + - description: GPU HUB clock + - description: HLOS vote clock + - description: GPU memory bus clock + - description: GPU SNoC bus clock + - description: GPU AHB clock + + - if: + properties: + compatible: + const: qcom,sm8550-smmu-500 + then: + properties: + clock-names: + items: + - const: hlos + - const: bus + - const: iface + - const: ahb + + clocks: + items: + - description: HLOS vote clock + - description: GPU memory bus clock + - description: GPU SNoC bus clock + - description: GPU AHB clock + # Disallow clocks for all other platforms with specific compatibles - if: properties: @@ -498,8 +544,6 @@ allOf: - qcom,sdx65-smmu-500 - qcom,sm6350-smmu-500 - qcom,sm6375-smmu-500 - - qcom,sm8450-smmu-500 - - qcom,sm8550-smmu-500 - qcom,sm8650-smmu-500 - qcom,x1e80100-smmu-500 then: -- GitLab From afc95681c3068956fed1241a1ff1612c066c75ac Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sun, 10 Dec 2023 10:06:53 -0800 Subject: [PATCH 0303/1290] iommu/arm-smmu-qcom: Add missing GMU entry to match table In some cases the firmware expects cbndx 1 to be assigned to the GMU, so we also want the default domain for the GMU to be an identy domain. This way it does not get a context bank assigned. Without this, both of_dma_configure() and drm/msm's iommu_domain_attach() will trigger allocating and configuring a context bank. So GMU ends up attached to both cbndx 1 and later cbndx 2. This arrangement seemingly confounds and surprises the firmware if the GPU later triggers a translation fault, resulting (on sc8280xp / lenovo x13s, at least) in the SMMU getting wedged and the GPU stuck without memory access. Cc: stable@vger.kernel.org Signed-off-by: Rob Clark Tested-by: Johan Hovold Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20231210180655.75542-1-robdclark@gmail.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 549ae4dba3a68..d326fa230b96e 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -243,6 +243,7 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,adreno" }, + { .compatible = "qcom,adreno-gmu" }, { .compatible = "qcom,mdp4" }, { .compatible = "qcom,mdss" }, { .compatible = "qcom,sc7180-mdss" }, -- GitLab From 28af105cb65043f20a16c5e2519d66390b1f9bb9 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 29 Nov 2023 15:44:02 +0100 Subject: [PATCH 0304/1290] iommu/arm-smmu-qcom: Add QCM2290 MDSS compatible Add the QCM2290 MDSS compatible to clients compatible list, as it also needs the workarounds. Reviewed-by: Dmitry Baryshkov Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20231125-topic-rb1_feat-v3-5-4cbb567743bb@linaro.org Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index d326fa230b96e..8b04ece00420d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -246,6 +246,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { .compatible = "qcom,adreno-gmu" }, { .compatible = "qcom,mdp4" }, { .compatible = "qcom,mdss" }, + { .compatible = "qcom,qcm2290-mdss" }, { .compatible = "qcom,sc7180-mdss" }, { .compatible = "qcom,sc7180-mss-pil" }, { .compatible = "qcom,sc7280-mdss" }, -- GitLab From 268dd4edb748a1e2f298b04854681453df6e77a2 Mon Sep 17 00:00:00 2001 From: Vladimir Lypak Date: Wed, 11 Oct 2023 19:57:26 +0200 Subject: [PATCH 0305/1290] iommu/qcom: restore IOMMU state if needed If the IOMMU has a power domain then some state will be lost in qcom_iommu_suspend and TZ will reset device if we don't call qcom_scm_restore_sec_cfg before accessing it again. Signed-off-by: Vladimir Lypak [luca@z3ntu.xyz: reword commit message a bit] Signed-off-by: Luca Weiss Link: https://lore.kernel.org/r/20231011-msm8953-iommu-restore-v1-1-48a0c93809a2@z3ntu.xyz Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index 97b2122032b23..67abeb02cf536 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -900,8 +900,16 @@ static void qcom_iommu_device_remove(struct platform_device *pdev) static int __maybe_unused qcom_iommu_resume(struct device *dev) { struct qcom_iommu_dev *qcom_iommu = dev_get_drvdata(dev); + int ret; + + ret = clk_bulk_prepare_enable(CLK_NUM, qcom_iommu->clks); + if (ret < 0) + return ret; + + if (dev->pm_domain) + return qcom_scm_restore_sec_cfg(qcom_iommu->sec_id, 0); - return clk_bulk_prepare_enable(CLK_NUM, qcom_iommu->clks); + return ret; } static int __maybe_unused qcom_iommu_suspend(struct device *dev) -- GitLab From b41932f544586e606a7493cbc6dfb3c97c6b44d7 Mon Sep 17 00:00:00 2001 From: Wenkai Lin Date: Wed, 6 Dec 2023 08:57:27 +0800 Subject: [PATCH 0306/1290] iommu/arm-smmu-v3: disable stall for quiet_cd In the stall model, invalid transactions were expected to be stalled and aborted by the IOPF handler. However, when killing a test case with a huge amount of data, the accelerator streamline can not stop until all data is consumed even if the page fault handler reports errors. As a result, the kill may take a long time, about 10 seconds with numerous iopf interrupts. So disable stall for quiet_cd in the non-force stall model, since force stall model (STALL_MODEL==0b10) requires CD.S must be 1. Signed-off-by: Zhangfei Gao Signed-off-by: Wenkai Lin Suggested-by: Jean-Philippe Brucker Reviewed-by: Jason Gunthorpe Reviewed-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20231206005727.46150-1-zhangfei.gao@linaro.org Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 7445454c2af24..7086e5fa41ff6 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1063,6 +1063,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, bool cd_live; __le64 *cdptr; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; + struct arm_smmu_device *smmu = master->smmu; if (WARN_ON(ssid >= (1 << cd_table->s1cdmax))) return -E2BIG; @@ -1077,6 +1078,8 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, if (!cd) { /* (5) */ val = 0; } else if (cd == &quiet_cd) { /* (4) */ + if (!(smmu->features & ARM_SMMU_FEAT_STALL_FORCE)) + val &= ~(CTXDESC_CD_0_S | CTXDESC_CD_0_R); val |= CTXDESC_CD_0_TCR_EPD0; } else if (cd_live) { /* (3) */ val &= ~CTXDESC_CD_0_ASID; -- GitLab From 1e536e10689700e006989dea33918cce348e04b6 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:37 -0800 Subject: [PATCH 0307/1290] x86/cpu: Detect TDX partial write machine check erratum TDX memory has integrity and confidentiality protections. Violations of this integrity protection are supposed to only affect TDX operations and are never supposed to affect the host kernel itself. In other words, the host kernel should never, itself, see machine checks induced by the TDX integrity hardware. Alas, the first few generations of TDX hardware have an erratum. A partial write to a TDX private memory cacheline will silently "poison" the line. Subsequent reads will consume the poison and generate a machine check. According to the TDX hardware spec, neither of these things should have happened. Virtually all kernel memory accesses operations happen in full cachelines. In practice, writing a "byte" of memory usually reads a 64 byte cacheline of memory, modifies it, then writes the whole line back. Those operations do not trigger this problem. This problem is triggered by "partial" writes where a write transaction of less than cacheline lands at the memory controller. The CPU does these via non-temporal write instructions (like MOVNTI), or through UC/WC memory mappings. The issue can also be triggered away from the CPU by devices doing partial writes via DMA. With this erratum, there are additional things need to be done. To prepare for those changes, add a CPU bug bit to indicate this erratum. Note this bug reflects the hardware thus it is detected regardless of whether the kernel is built with TDX support or not. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Reviewed-by: David Hildenbrand Reviewed-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-17-dave.hansen%40intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/virt/vmx/tdx/tdx.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 45ddc6b6baaa1..dbec4a0cf36a8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -496,6 +496,7 @@ #define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ #define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ #define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */ +#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* CPU may incur #MC if non-TD software does partial write to TDX private memory */ /* BUG word 2 */ #define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 6d030f64720ba..06b04b9c3236d 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include "tdx.h" static u32 tdx_global_keyid __ro_after_init; @@ -1308,6 +1310,21 @@ static struct notifier_block tdx_memory_nb = { .notifier_call = tdx_memory_notifier, }; +static void __init check_tdx_erratum(void) +{ + /* + * These CPUs have an erratum. A partial write from non-TD + * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX + * private memory poisons that memory, and a subsequent read of + * that memory triggers #MC. + */ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_SAPPHIRERAPIDS_X: + case INTEL_FAM6_EMERALDRAPIDS_X: + setup_force_cpu_bug(X86_BUG_TDX_PW_MCE); + } +} + void __init tdx_init(void) { u32 tdx_keyid_start, nr_tdx_keyids; @@ -1361,4 +1378,6 @@ void __init tdx_init(void) tdx_nr_guest_keyids = nr_tdx_keyids - 1; setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM); + + check_tdx_erratum(); } -- GitLab From 70060463cb2ba72bbb77c7ced503f53e09380f17 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:38 -0800 Subject: [PATCH 0308/1290] x86/mce: Differentiate real hardware #MCs from TDX erratum ones The first few generations of TDX hardware have an erratum. Triggering it in Linux requires some kind of kernel bug involving relatively exotic memory writes to TDX private memory and will manifest via spurious-looking machine checks when reading the affected memory. Make an effort to detect these TDX-induced machine checks and spit out a new blurb to dmesg so folks do not think their hardware is failing. == Background == Virtually all kernel memory accesses operations happen in full cachelines. In practice, writing a "byte" of memory usually reads a 64 byte cacheline of memory, modifies it, then writes the whole line back. Those operations do not trigger this problem. This problem is triggered by "partial" writes where a write transaction of less than cacheline lands at the memory controller. The CPU does these via non-temporal write instructions (like MOVNTI), or through UC/WC memory mappings. The issue can also be triggered away from the CPU by devices doing partial writes via DMA. == Problem == A partial write to a TDX private memory cacheline will silently "poison" the line. Subsequent reads will consume the poison and generate a machine check. According to the TDX hardware spec, neither of these things should have happened. To add insult to injury, the Linux machine code will present these as a literal "Hardware error" when they were, in fact, a software-triggered issue. == Solution == In the end, this issue is hard to trigger. Rather than do something rash (and incomplete) like unmap TDX private memory from the direct map, improve the machine check handler. Currently, the #MC handler doesn't distinguish whether the memory is TDX private memory or not but just dump, for instance, below message: [...] mce: [Hardware Error]: CPU 147: Machine Check Exception: f Bank 1: bd80000000100134 [...] mce: [Hardware Error]: RIP 10: {__tlb_remove_page_size+0x10/0xa0} ... [...] mce: [Hardware Error]: Run the above through 'mcelog --ascii' [...] mce: [Hardware Error]: Machine check: Data load in unrecoverable area of kernel [...] Kernel panic - not syncing: Fatal local machine check Which says "Hardware Error" and "Data load in unrecoverable area of kernel". Ideally, it's better for the log to say "software bug around TDX private memory" instead of "Hardware Error". But in reality the real hardware memory error can happen, and sadly such software-triggered #MC cannot be distinguished from the real hardware error. Also, the error message is used by userspace tool 'mcelog' to parse, so changing the output may break userspace. So keep the "Hardware Error". The "Data load in unrecoverable area of kernel" is also helpful, so keep it too. Instead of modifying above error log, improve the error log by printing additional TDX related message to make the log like: ... [...] mce: [Hardware Error]: Machine check: Data load in unrecoverable area of kernel [...] mce: [Hardware Error]: Machine Check: TDX private memory error. Possible kernel bug. Adding this additional message requires determination of whether the memory page is TDX private memory. There is no existing infrastructure to do that. Add an interface to query the TDX module to fill this gap. == Impact == This issue requires some kind of kernel bug to trigger. TDX private memory should never be mapped UC/WC. A partial write originating from these mappings would require *two* bugs, first mapping the wrong page, then writing the wrong memory. It would also be detectable using traditional memory corruption techniques like DEBUG_PAGEALLOC. MOVNTI (and friends) could cause this issue with something like a simple buffer overrun or use-after-free on the direct map. It should also be detectable with normal debug techniques. The one place where this might get nasty would be if the CPU read data then wrote back the same data. That would trigger this problem but would not, for instance, set off mechanisms like slab redzoning because it doesn't actually corrupt data. With an IOMMU at least, the DMA exposure is similar to the UC/WC issue. TDX private memory would first need to be incorrectly mapped into the I/O space and then a later DMA to that mapping would actually cause the poisoning event. [ dhansen: changelog tweaks ] Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Reviewed-by: Yuan Yao Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Link: https://lore.kernel.org/all/20231208170740.53979-18-dave.hansen%40intel.com --- arch/x86/include/asm/tdx.h | 4 ++ arch/x86/kernel/cpu/mce/core.c | 15 +++++ arch/x86/virt/vmx/tdx/tdx.c | 109 +++++++++++++++++++++++++++++++++ arch/x86/virt/vmx/tdx/tdx.h | 5 ++ 4 files changed, 133 insertions(+) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index c54948e1999c0..eba178996d845 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -32,6 +32,8 @@ #ifndef __ASSEMBLY__ +#include + /* * Used by the #VE exception handler to gather the #VE exception * info from the TDX module. This is a software only structure @@ -113,10 +115,12 @@ static inline u64 sc_retry(sc_func_t func, u64 fn, #define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args)) int tdx_cpu_enable(void); int tdx_enable(void); +const char *tdx_dump_mce_info(struct mce *m); #else static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } static inline int tdx_enable(void) { return -ENODEV; } +static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7b397370b4d64..00d2b6b1dc24b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "internal.h" @@ -228,11 +229,20 @@ static void wait_for_panic(void) panic("Panicing machine check CPU died"); } +static const char *mce_dump_aux_info(struct mce *m) +{ + if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) + return tdx_dump_mce_info(m); + + return NULL; +} + static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) { struct llist_node *pending; struct mce_evt_llist *l; int apei_err = 0; + const char *memmsg; /* * Allow instrumentation around external facilities usage. Not that it @@ -283,6 +293,11 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) } if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); + + memmsg = mce_dump_aux_info(final); + if (memmsg) + pr_emerg(HW_ERR "Machine check: %s\n", memmsg); + if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mca_cfg.panic_timeout; diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 06b04b9c3236d..4d6826a76f788 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include "tdx.h" static u32 tdx_global_keyid __ro_after_init; @@ -942,6 +944,13 @@ static int construct_tdmrs(struct list_head *tmb_list, if (ret) tdmrs_free_pamt_all(tdmr_list); + /* + * The tdmr_info_list is read-only from here on out. + * Ensure that these writes are seen by other CPUs. + * Pairs with a smp_rmb() in is_pamt_page(). + */ + smp_wmb(); + return ret; } @@ -1235,6 +1244,106 @@ int tdx_enable(void) } EXPORT_SYMBOL_GPL(tdx_enable); +static bool is_pamt_page(unsigned long phys) +{ + struct tdmr_info_list *tdmr_list = &tdx_tdmr_list; + int i; + + /* Ensure that all remote 'tdmr_list' writes are visible: */ + smp_rmb(); + + /* + * The TDX module is no longer returning TDX_SYS_NOT_READY and + * is initialized. The 'tdmr_list' was initialized long ago + * and is now read-only. + */ + for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) { + unsigned long base, size; + + tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size); + + if (phys >= base && phys < (base + size)) + return true; + } + + return false; +} + +/* + * Return whether the memory page at the given physical address is TDX + * private memory or not. + * + * This can be imprecise for two known reasons: + * 1. PAMTs are private memory and exist before the TDX module is + * ready and TDH_PHYMEM_PAGE_RDMD works. This is a relatively + * short window that occurs once per boot. + * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the + * page. However, the page can still cause #MC until it has been + * fully converted to shared using 64-byte writes like MOVDIR64B. + * Buggy hosts might still leave #MC-causing memory in place which + * this function can not detect. + */ +static bool paddr_is_tdx_private(unsigned long phys) +{ + struct tdx_module_args args = { + .rcx = phys & PAGE_MASK, + }; + u64 sret; + + if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM)) + return false; + + /* Get page type from the TDX module */ + sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args); + + /* + * The SEAMCALL will not return success unless there is a + * working, "ready" TDX module. Assume an absence of TDX + * private pages until SEAMCALL is working. + */ + if (sret) + return false; + + /* + * SEAMCALL was successful -- read page type (via RCX): + * + * - PT_NDA: Page is not used by the TDX module + * - PT_RSVD: Reserved for Non-TDX use + * - Others: Page is used by the TDX module + * + * Note PAMT pages are marked as PT_RSVD but they are also TDX + * private memory. + */ + switch (args.rcx) { + case PT_NDA: + return false; + case PT_RSVD: + return is_pamt_page(phys); + default: + return true; + } +} + +/* + * Some TDX-capable CPUs have an erratum. A write to TDX private + * memory poisons that memory, and a subsequent read of that memory + * triggers #MC. + * + * Help distinguish erratum-triggered #MCs from a normal hardware one. + * Just print additional message to show such #MC may be result of the + * erratum. + */ +const char *tdx_dump_mce_info(struct mce *m) +{ + if (!m || !mce_is_memory_error(m) || !mce_usable_address(m)) + return NULL; + + if (!paddr_is_tdx_private(m->addr)) + return NULL; + + return "TDX private memory error. Possible kernel bug."; +} + static __init int record_keyid_partitioning(u32 *tdx_keyid_start, u32 *nr_tdx_keyids) { diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index c0610f0bb88ce..b701f69485d32 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -14,6 +14,7 @@ /* * TDX module SEAMCALL leaf functions */ +#define TDH_PHYMEM_PAGE_RDMD 24 #define TDH_SYS_KEY_CONFIG 31 #define TDH_SYS_INIT 33 #define TDH_SYS_RD 34 @@ -21,6 +22,10 @@ #define TDH_SYS_TDMR_INIT 36 #define TDH_SYS_CONFIG 45 +/* TDX page types */ +#define PT_NDA 0x0 +#define PT_RSVD 0x1 + /* * Global scope metadata field ID. * -- GitLab From 4e1c7dddc71708c21d7fe69cc5f8297ffb7c6965 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 8 Dec 2023 09:07:39 -0800 Subject: [PATCH 0309/1290] Documentation/x86: Add documentation for TDX host support Add documentation for TDX host kernel support. There is already one file Documentation/x86/tdx.rst containing documentation for TDX guest internals. Also reuse it for TDX host kernel support. Introduce a new level menu "TDX Guest Support" and move existing materials under it, and add a new menu for TDX host kernel support. Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-19-dave.hansen%40intel.com --- Documentation/arch/x86/tdx.rst | 207 +++++++++++++++++++++++++++++++-- 1 file changed, 196 insertions(+), 11 deletions(-) diff --git a/Documentation/arch/x86/tdx.rst b/Documentation/arch/x86/tdx.rst index dc8d9fd2c3f76..719043cd8b469 100644 --- a/Documentation/arch/x86/tdx.rst +++ b/Documentation/arch/x86/tdx.rst @@ -10,6 +10,191 @@ encrypting the guest memory. In TDX, a special module running in a special mode sits between the host and the guest and manages the guest/host separation. +TDX Host Kernel Support +======================= + +TDX introduces a new CPU mode called Secure Arbitration Mode (SEAM) and +a new isolated range pointed by the SEAM Ranger Register (SEAMRR). A +CPU-attested software module called 'the TDX module' runs inside the new +isolated range to provide the functionalities to manage and run protected +VMs. + +TDX also leverages Intel Multi-Key Total Memory Encryption (MKTME) to +provide crypto-protection to the VMs. TDX reserves part of MKTME KeyIDs +as TDX private KeyIDs, which are only accessible within the SEAM mode. +BIOS is responsible for partitioning legacy MKTME KeyIDs and TDX KeyIDs. + +Before the TDX module can be used to create and run protected VMs, it +must be loaded into the isolated range and properly initialized. The TDX +architecture doesn't require the BIOS to load the TDX module, but the +kernel assumes it is loaded by the BIOS. + +TDX boot-time detection +----------------------- + +The kernel detects TDX by detecting TDX private KeyIDs during kernel +boot. Below dmesg shows when TDX is enabled by BIOS:: + + [..] virt/tdx: BIOS enabled: private KeyID range: [16, 64) + +TDX module initialization +--------------------------------------- + +The kernel talks to the TDX module via the new SEAMCALL instruction. The +TDX module implements SEAMCALL leaf functions to allow the kernel to +initialize it. + +If the TDX module isn't loaded, the SEAMCALL instruction fails with a +special error. In this case the kernel fails the module initialization +and reports the module isn't loaded:: + + [..] virt/tdx: module not loaded + +Initializing the TDX module consumes roughly ~1/256th system RAM size to +use it as 'metadata' for the TDX memory. It also takes additional CPU +time to initialize those metadata along with the TDX module itself. Both +are not trivial. The kernel initializes the TDX module at runtime on +demand. + +Besides initializing the TDX module, a per-cpu initialization SEAMCALL +must be done on one cpu before any other SEAMCALLs can be made on that +cpu. + +The kernel provides two functions, tdx_enable() and tdx_cpu_enable() to +allow the user of TDX to enable the TDX module and enable TDX on local +cpu respectively. + +Making SEAMCALL requires VMXON has been done on that CPU. Currently only +KVM implements VMXON. For now both tdx_enable() and tdx_cpu_enable() +don't do VMXON internally (not trivial), but depends on the caller to +guarantee that. + +To enable TDX, the caller of TDX should: 1) temporarily disable CPU +hotplug; 2) do VMXON and tdx_enable_cpu() on all online cpus; 3) call +tdx_enable(). For example:: + + cpus_read_lock(); + on_each_cpu(vmxon_and_tdx_cpu_enable()); + ret = tdx_enable(); + cpus_read_unlock(); + if (ret) + goto no_tdx; + // TDX is ready to use + +And the caller of TDX must guarantee the tdx_cpu_enable() has been +successfully done on any cpu before it wants to run any other SEAMCALL. +A typical usage is do both VMXON and tdx_cpu_enable() in CPU hotplug +online callback, and refuse to online if tdx_cpu_enable() fails. + +User can consult dmesg to see whether the TDX module has been initialized. + +If the TDX module is initialized successfully, dmesg shows something +like below:: + + [..] virt/tdx: 262668 KBs allocated for PAMT + [..] virt/tdx: module initialized + +If the TDX module failed to initialize, dmesg also shows it failed to +initialize:: + + [..] virt/tdx: module initialization failed ... + +TDX Interaction to Other Kernel Components +------------------------------------------ + +TDX Memory Policy +~~~~~~~~~~~~~~~~~ + +TDX reports a list of "Convertible Memory Region" (CMR) to tell the +kernel which memory is TDX compatible. The kernel needs to build a list +of memory regions (out of CMRs) as "TDX-usable" memory and pass those +regions to the TDX module. Once this is done, those "TDX-usable" memory +regions are fixed during module's lifetime. + +To keep things simple, currently the kernel simply guarantees all pages +in the page allocator are TDX memory. Specifically, the kernel uses all +system memory in the core-mm "at the time of TDX module initialization" +as TDX memory, and in the meantime, refuses to online any non-TDX-memory +in the memory hotplug. + +Physical Memory Hotplug +~~~~~~~~~~~~~~~~~~~~~~~ + +Note TDX assumes convertible memory is always physically present during +machine's runtime. A non-buggy BIOS should never support hot-removal of +any convertible memory. This implementation doesn't handle ACPI memory +removal but depends on the BIOS to behave correctly. + +CPU Hotplug +~~~~~~~~~~~ + +TDX module requires the per-cpu initialization SEAMCALL must be done on +one cpu before any other SEAMCALLs can be made on that cpu. The kernel +provides tdx_cpu_enable() to let the user of TDX to do it when the user +wants to use a new cpu for TDX task. + +TDX doesn't support physical (ACPI) CPU hotplug. During machine boot, +TDX verifies all boot-time present logical CPUs are TDX compatible before +enabling TDX. A non-buggy BIOS should never support hot-add/removal of +physical CPU. Currently the kernel doesn't handle physical CPU hotplug, +but depends on the BIOS to behave correctly. + +Note TDX works with CPU logical online/offline, thus the kernel still +allows to offline logical CPU and online it again. + +Kexec() +~~~~~~~ + +TDX host support currently lacks the ability to handle kexec. For +simplicity only one of them can be enabled in the Kconfig. This will be +fixed in the future. + +Erratum +~~~~~~~ + +The first few generations of TDX hardware have an erratum. A partial +write to a TDX private memory cacheline will silently "poison" the +line. Subsequent reads will consume the poison and generate a machine +check. + +A partial write is a memory write where a write transaction of less than +cacheline lands at the memory controller. The CPU does these via +non-temporal write instructions (like MOVNTI), or through UC/WC memory +mappings. Devices can also do partial writes via DMA. + +Theoretically, a kernel bug could do partial write to TDX private memory +and trigger unexpected machine check. What's more, the machine check +code will present these as "Hardware error" when they were, in fact, a +software-triggered issue. But in the end, this issue is hard to trigger. + +If the platform has such erratum, the kernel prints additional message in +machine check handler to tell user the machine check may be caused by +kernel bug on TDX private memory. + +Interaction vs S3 and deeper states +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +TDX cannot survive from S3 and deeper states. The hardware resets and +disables TDX completely when platform goes to S3 and deeper. Both TDX +guests and the TDX module get destroyed permanently. + +The kernel uses S3 for suspend-to-ram, and use S4 and deeper states for +hibernation. Currently, for simplicity, the kernel chooses to make TDX +mutually exclusive with S3 and hibernation. + +The kernel disables TDX during early boot when hibernation support is +available:: + + [..] virt/tdx: initialization failed: Hibernation support is enabled + +Add 'nohibernate' kernel command line to disable hibernation in order to +use TDX. + +ACPI S3 is disabled during kernel early boot if TDX is enabled. The user +needs to turn off TDX in the BIOS in order to use S3. + +TDX Guest Support +================= Since the host cannot directly access guest registers or memory, much normal functionality of a hypervisor must be moved into the guest. This is implemented using a Virtualization Exception (#VE) that is handled by the @@ -20,7 +205,7 @@ TDX includes new hypercall-like mechanisms for communicating from the guest to the hypervisor or the TDX module. New TDX Exceptions -================== +------------------ TDX guests behave differently from bare-metal and traditional VMX guests. In TDX guests, otherwise normal instructions or memory accesses can cause @@ -30,7 +215,7 @@ Instructions marked with an '*' conditionally cause exceptions. The details for these instructions are discussed below. Instruction-based #VE ---------------------- +~~~~~~~~~~~~~~~~~~~~~ - Port I/O (INS, OUTS, IN, OUT) - HLT @@ -41,7 +226,7 @@ Instruction-based #VE - CPUID* Instruction-based #GP ---------------------- +~~~~~~~~~~~~~~~~~~~~~ - All VMX instructions: INVEPT, INVVPID, VMCLEAR, VMFUNC, VMLAUNCH, VMPTRLD, VMPTRST, VMREAD, VMRESUME, VMWRITE, VMXOFF, VMXON @@ -52,7 +237,7 @@ Instruction-based #GP - RDMSR*,WRMSR* RDMSR/WRMSR Behavior --------------------- +~~~~~~~~~~~~~~~~~~~~ MSR access behavior falls into three categories: @@ -73,7 +258,7 @@ trapping and handling in the TDX module. Other than possibly being slow, these MSRs appear to function just as they would on bare metal. CPUID Behavior --------------- +~~~~~~~~~~~~~~ For some CPUID leaves and sub-leaves, the virtualized bit fields of CPUID return values (in guest EAX/EBX/ECX/EDX) are configurable by the @@ -93,7 +278,7 @@ not know how to handle. The guest kernel may ask the hypervisor for the value with a hypercall. #VE on Memory Accesses -====================== +---------------------- There are essentially two classes of TDX memory: private and shared. Private memory receives full TDX protections. Its content is protected @@ -107,7 +292,7 @@ entries. This helps ensure that a guest does not place sensitive information in shared memory, exposing it to the untrusted hypervisor. #VE on Shared Memory --------------------- +~~~~~~~~~~~~~~~~~~~~ Access to shared mappings can cause a #VE. The hypervisor ultimately controls whether a shared memory access causes a #VE, so the guest must be @@ -127,7 +312,7 @@ be careful not to access device MMIO regions unless it is also prepared to handle a #VE. #VE on Private Pages --------------------- +~~~~~~~~~~~~~~~~~~~~ An access to private mappings can also cause a #VE. Since all kernel memory is also private memory, the kernel might theoretically need to @@ -145,7 +330,7 @@ The hypervisor is permitted to unilaterally move accepted pages to a to handle the exception. Linux #VE handler -================= +----------------- Just like page faults or #GP's, #VE exceptions can be either handled or be fatal. Typically, an unhandled userspace #VE results in a SIGSEGV. @@ -167,7 +352,7 @@ While the block is in place, any #VE is elevated to a double fault (#DF) which is not recoverable. MMIO handling -============= +------------- In non-TDX VMs, MMIO is usually implemented by giving a guest access to a mapping which will cause a VMEXIT on access, and then the hypervisor @@ -189,7 +374,7 @@ MMIO access via other means (like structure overlays) may result in an oops. Shared Memory Conversions -========================= +------------------------- All TDX guest memory starts out as private at boot. This memory can not be accessed by the hypervisor. However, some kernel users like device -- GitLab From cb8eb06d50fcf4a478813a612f68c38cca45c742 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 8 Dec 2023 09:07:40 -0800 Subject: [PATCH 0310/1290] x86/virt/tdx: Disable TDX host support when kexec is enabled TDX host support currently lacks the ability to handle kexec. Disable TDX when kexec is enabled. Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20231208170740.53979-20-dave.hansen%40intel.com --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e255d8ae5e77e..01cdb16acff6e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1973,6 +1973,7 @@ config INTEL_TDX_HOST depends on X86_X2APIC select ARCH_KEEP_MEMBLOCK depends on CONTIG_ALLOC + depends on !KEXEC_CORE help Intel Trust Domain Extensions (TDX) protects guest VMs from malicious host and certain physical attacks. This option enables necessary TDX -- GitLab From 48219b089d84f109e8a81d8a7fa1bbc2e6e5f97d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 28 Nov 2023 22:01:58 -0800 Subject: [PATCH 0311/1290] libperf cpumap: Rename perf_cpu_map__dummy_new() to perf_cpu_map__new_any_cpu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename perf_cpu_map__dummy_new() to perf_cpu_map__new_any_cpu() to better indicate this is creating a CPU map for the perf_event_open "any" CPU case. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Jajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Cc: bpf@vger.kernel.org Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20231129060211.1890454-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/Documentation/libperf.txt | 2 +- tools/lib/perf/cpumap.c | 4 ++-- tools/lib/perf/evsel.c | 2 +- tools/lib/perf/include/perf/cpumap.h | 4 ++-- tools/lib/perf/libperf.map | 2 +- tools/lib/perf/tests/test-cpumap.c | 2 +- tools/lib/perf/tests/test-evlist.c | 2 +- tools/perf/tests/cpumap.c | 2 +- tools/perf/tests/sw-clock.c | 2 +- tools/perf/tests/task-exit.c | 2 +- tools/perf/util/evlist.c | 2 +- tools/perf/util/evsel.c | 2 +- 12 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/lib/perf/Documentation/libperf.txt b/tools/lib/perf/Documentation/libperf.txt index a8f1a237931b1..a256a26598b04 100644 --- a/tools/lib/perf/Documentation/libperf.txt +++ b/tools/lib/perf/Documentation/libperf.txt @@ -37,7 +37,7 @@ SYNOPSIS struct perf_cpu_map; - struct perf_cpu_map *perf_cpu_map__dummy_new(void); + struct perf_cpu_map *perf_cpu_map__new_any_cpu(void); struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list); struct perf_cpu_map *perf_cpu_map__read(FILE *file); struct perf_cpu_map *perf_cpu_map__get(struct perf_cpu_map *map); diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c index 2a5a292173740..2bd6aba3d8c9a 100644 --- a/tools/lib/perf/cpumap.c +++ b/tools/lib/perf/cpumap.c @@ -27,7 +27,7 @@ struct perf_cpu_map *perf_cpu_map__alloc(int nr_cpus) return result; } -struct perf_cpu_map *perf_cpu_map__dummy_new(void) +struct perf_cpu_map *perf_cpu_map__new_any_cpu(void) { struct perf_cpu_map *cpus = perf_cpu_map__alloc(1); @@ -271,7 +271,7 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list) else if (*cpu_list != '\0') cpus = cpu_map__default_new(); else - cpus = perf_cpu_map__dummy_new(); + cpus = perf_cpu_map__new_any_cpu(); invalid: free(tmp_cpus); out: diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c index 8b51b008a81f1..c07160953224a 100644 --- a/tools/lib/perf/evsel.c +++ b/tools/lib/perf/evsel.c @@ -120,7 +120,7 @@ int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus, static struct perf_cpu_map *empty_cpu_map; if (empty_cpu_map == NULL) { - empty_cpu_map = perf_cpu_map__dummy_new(); + empty_cpu_map = perf_cpu_map__new_any_cpu(); if (empty_cpu_map == NULL) return -ENOMEM; } diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h index e38d859a384d2..d0bf218ada119 100644 --- a/tools/lib/perf/include/perf/cpumap.h +++ b/tools/lib/perf/include/perf/cpumap.h @@ -19,9 +19,9 @@ struct perf_cache { struct perf_cpu_map; /** - * perf_cpu_map__dummy_new - a map with a singular "any CPU"/dummy -1 value. + * perf_cpu_map__new_any_cpu - a map with a singular "any CPU"/dummy -1 value. */ -LIBPERF_API struct perf_cpu_map *perf_cpu_map__dummy_new(void); +LIBPERF_API struct perf_cpu_map *perf_cpu_map__new_any_cpu(void); LIBPERF_API struct perf_cpu_map *perf_cpu_map__default_new(void); LIBPERF_API struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list); LIBPERF_API struct perf_cpu_map *perf_cpu_map__read(FILE *file); diff --git a/tools/lib/perf/libperf.map b/tools/lib/perf/libperf.map index 190b56ae923ad..a8ff64baea3e1 100644 --- a/tools/lib/perf/libperf.map +++ b/tools/lib/perf/libperf.map @@ -1,7 +1,7 @@ LIBPERF_0.0.1 { global: libperf_init; - perf_cpu_map__dummy_new; + perf_cpu_map__new_any_cpu; perf_cpu_map__default_new; perf_cpu_map__get; perf_cpu_map__put; diff --git a/tools/lib/perf/tests/test-cpumap.c b/tools/lib/perf/tests/test-cpumap.c index 87b0510a556ff..2c359bdb951ee 100644 --- a/tools/lib/perf/tests/test-cpumap.c +++ b/tools/lib/perf/tests/test-cpumap.c @@ -21,7 +21,7 @@ int test_cpumap(int argc, char **argv) libperf_init(libperf_print); - cpus = perf_cpu_map__dummy_new(); + cpus = perf_cpu_map__new_any_cpu(); if (!cpus) return -1; diff --git a/tools/lib/perf/tests/test-evlist.c b/tools/lib/perf/tests/test-evlist.c index ed616fc19b4f2..ab63878bacb99 100644 --- a/tools/lib/perf/tests/test-evlist.c +++ b/tools/lib/perf/tests/test-evlist.c @@ -261,7 +261,7 @@ static int test_mmap_thread(void) threads = perf_thread_map__new_dummy(); __T("failed to create threads", threads); - cpus = perf_cpu_map__dummy_new(); + cpus = perf_cpu_map__new_any_cpu(); __T("failed to create cpus", cpus); perf_thread_map__set_pid(threads, 0, pid); diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index 7730fc2ab40b7..bd8e396f3e57b 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -213,7 +213,7 @@ static int test__cpu_map_intersect(struct test_suite *test __maybe_unused, static int test__cpu_map_equal(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { - struct perf_cpu_map *any = perf_cpu_map__dummy_new(); + struct perf_cpu_map *any = perf_cpu_map__new_any_cpu(); struct perf_cpu_map *one = perf_cpu_map__new("1"); struct perf_cpu_map *two = perf_cpu_map__new("2"); struct perf_cpu_map *empty = perf_cpu_map__intersect(one, two); diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index 4d7493fa01059..290716783ac6a 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -62,7 +62,7 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) } evlist__add(evlist, evsel); - cpus = perf_cpu_map__dummy_new(); + cpus = perf_cpu_map__new_any_cpu(); threads = thread_map__new_by_tid(getpid()); if (!cpus || !threads) { err = -ENOMEM; diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index 968dddde6ddaf..d33d0952025cf 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -70,7 +70,7 @@ static int test__task_exit(struct test_suite *test __maybe_unused, int subtest _ * evlist__prepare_workload we'll fill in the only thread * we're monitoring, the one forked there. */ - cpus = perf_cpu_map__dummy_new(); + cpus = perf_cpu_map__new_any_cpu(); threads = thread_map__new_by_tid(-1); if (!cpus || !threads) { err = -ENOMEM; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index e36da58522efb..ff7f85ded89d7 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1056,7 +1056,7 @@ int evlist__create_maps(struct evlist *evlist, struct target *target) return -1; if (target__uses_dummy_map(target)) - cpus = perf_cpu_map__dummy_new(); + cpus = perf_cpu_map__new_any_cpu(); else cpus = perf_cpu_map__new(target->cpu_list); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 532f34d9fcb50..6d7c9c58a9bcb 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1801,7 +1801,7 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus, if (cpus == NULL) { if (empty_cpu_map == NULL) { - empty_cpu_map = perf_cpu_map__dummy_new(); + empty_cpu_map = perf_cpu_map__new_any_cpu(); if (empty_cpu_map == NULL) return -ENOMEM; } -- GitLab From 8f60f870a9af53295ab4301da05ca453f115a6b6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 28 Nov 2023 22:01:59 -0800 Subject: [PATCH 0312/1290] libperf cpumap: Rename perf_cpu_map__default_new() to perf_cpu_map__new_online_cpus() and prefer sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename perf_cpu_map__default_new() to perf_cpu_map__new_online_cpus() to better indicate what the implementation does. Read the online CPUs from /sys/devices/system/cpu/online first before using sysconf() as it can't accurately configure holes in the CPU map. If sysconf() is used, warn when the configured and online processors disagree. When reading from a file, if the read doesn't yield a CPU map then return an empty map rather than the default online. This avoids recursion but also better yields being able to detect failures. Add more comments. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Jajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Cc: bpf@vger.kernel.org Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20231129060211.1890454-3-irogers@google.com [ s/syfs/sysfs/g typo ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/cpumap.c | 59 +++++++++++++++++----------- tools/lib/perf/include/perf/cpumap.h | 15 ++++++- tools/lib/perf/libperf.map | 2 +- tools/lib/perf/tests/test-cpumap.c | 2 +- 4 files changed, 51 insertions(+), 27 deletions(-) diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c index 2bd6aba3d8c9a..3aa80d0d26e86 100644 --- a/tools/lib/perf/cpumap.c +++ b/tools/lib/perf/cpumap.c @@ -9,6 +9,7 @@ #include #include #include +#include "internal.h" void perf_cpu_map__set_nr(struct perf_cpu_map *map, int nr_cpus) { @@ -66,15 +67,21 @@ void perf_cpu_map__put(struct perf_cpu_map *map) } } -static struct perf_cpu_map *cpu_map__default_new(void) +static struct perf_cpu_map *cpu_map__new_sysconf(void) { struct perf_cpu_map *cpus; - int nr_cpus; + int nr_cpus, nr_cpus_conf; nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); if (nr_cpus < 0) return NULL; + nr_cpus_conf = sysconf(_SC_NPROCESSORS_CONF); + if (nr_cpus != nr_cpus_conf) { + pr_warning("Number of online CPUs (%d) differs from the number configured (%d) the CPU map will only cover the first %d CPUs.", + nr_cpus, nr_cpus_conf, nr_cpus); + } + cpus = perf_cpu_map__alloc(nr_cpus); if (cpus != NULL) { int i; @@ -86,9 +93,27 @@ static struct perf_cpu_map *cpu_map__default_new(void) return cpus; } -struct perf_cpu_map *perf_cpu_map__default_new(void) +static struct perf_cpu_map *cpu_map__new_sysfs_online(void) { - return cpu_map__default_new(); + struct perf_cpu_map *cpus = NULL; + FILE *onlnf; + + onlnf = fopen("/sys/devices/system/cpu/online", "r"); + if (onlnf) { + cpus = perf_cpu_map__read(onlnf); + fclose(onlnf); + } + return cpus; +} + +struct perf_cpu_map *perf_cpu_map__new_online_cpus(void) +{ + struct perf_cpu_map *cpus = cpu_map__new_sysfs_online(); + + if (cpus) + return cpus; + + return cpu_map__new_sysconf(); } @@ -180,27 +205,11 @@ struct perf_cpu_map *perf_cpu_map__read(FILE *file) if (nr_cpus > 0) cpus = cpu_map__trim_new(nr_cpus, tmp_cpus); - else - cpus = cpu_map__default_new(); out_free_tmp: free(tmp_cpus); return cpus; } -static struct perf_cpu_map *cpu_map__read_all_cpu_map(void) -{ - struct perf_cpu_map *cpus = NULL; - FILE *onlnf; - - onlnf = fopen("/sys/devices/system/cpu/online", "r"); - if (!onlnf) - return cpu_map__default_new(); - - cpus = perf_cpu_map__read(onlnf); - fclose(onlnf); - return cpus; -} - struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list) { struct perf_cpu_map *cpus = NULL; @@ -211,7 +220,7 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list) int max_entries = 0; if (!cpu_list) - return cpu_map__read_all_cpu_map(); + return perf_cpu_map__new_online_cpus(); /* * must handle the case of empty cpumap to cover @@ -268,9 +277,11 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list) if (nr_cpus > 0) cpus = cpu_map__trim_new(nr_cpus, tmp_cpus); - else if (*cpu_list != '\0') - cpus = cpu_map__default_new(); - else + else if (*cpu_list != '\0') { + pr_warning("Unexpected characters at end of cpu list ('%s'), using online CPUs.", + cpu_list); + cpus = perf_cpu_map__new_online_cpus(); + } else cpus = perf_cpu_map__new_any_cpu(); invalid: free(tmp_cpus); diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h index d0bf218ada119..b24bd8b8f34e1 100644 --- a/tools/lib/perf/include/perf/cpumap.h +++ b/tools/lib/perf/include/perf/cpumap.h @@ -22,7 +22,20 @@ struct perf_cpu_map; * perf_cpu_map__new_any_cpu - a map with a singular "any CPU"/dummy -1 value. */ LIBPERF_API struct perf_cpu_map *perf_cpu_map__new_any_cpu(void); -LIBPERF_API struct perf_cpu_map *perf_cpu_map__default_new(void); +/** + * perf_cpu_map__new_online_cpus - a map read from + * /sys/devices/system/cpu/online if + * available. If reading wasn't possible a map + * is created using the online processors + * assuming the first 'n' processors are all + * online. + */ +LIBPERF_API struct perf_cpu_map *perf_cpu_map__new_online_cpus(void); +/** + * perf_cpu_map__new - create a map from the given cpu_list such as "0-7". If no + * cpu_list argument is provided then + * perf_cpu_map__new_online_cpus is returned. + */ LIBPERF_API struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list); LIBPERF_API struct perf_cpu_map *perf_cpu_map__read(FILE *file); LIBPERF_API struct perf_cpu_map *perf_cpu_map__get(struct perf_cpu_map *map); diff --git a/tools/lib/perf/libperf.map b/tools/lib/perf/libperf.map index a8ff64baea3e1..8a71f841498e4 100644 --- a/tools/lib/perf/libperf.map +++ b/tools/lib/perf/libperf.map @@ -2,7 +2,7 @@ LIBPERF_0.0.1 { global: libperf_init; perf_cpu_map__new_any_cpu; - perf_cpu_map__default_new; + perf_cpu_map__new_online_cpus; perf_cpu_map__get; perf_cpu_map__put; perf_cpu_map__new; diff --git a/tools/lib/perf/tests/test-cpumap.c b/tools/lib/perf/tests/test-cpumap.c index 2c359bdb951ee..c998b1dae8631 100644 --- a/tools/lib/perf/tests/test-cpumap.c +++ b/tools/lib/perf/tests/test-cpumap.c @@ -29,7 +29,7 @@ int test_cpumap(int argc, char **argv) perf_cpu_map__put(cpus); perf_cpu_map__put(cpus); - cpus = perf_cpu_map__default_new(); + cpus = perf_cpu_map__new_online_cpus(); if (!cpus) return -1; -- GitLab From 923ca62a7b1edceaa61eb6ac8dc56fdac51913b8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 28 Nov 2023 22:02:00 -0800 Subject: [PATCH 0313/1290] libperf cpumap: Rename perf_cpu_map__empty() to perf_cpu_map__has_any_cpu_or_is_empty() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The name perf_cpu_map_empty is misleading as true is also returned when the map contains an "any" CPU (aka dummy) map. Rename to perf_cpu_map__has_any_cpu_or_is_empty(), later changes will (re)introduce perf_cpu_map__empty() and perf_cpu_map__has_any_cpu(). Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Jajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Cc: bpf@vger.kernel.org Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20231129060211.1890454-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/Documentation/libperf.txt | 2 +- tools/lib/perf/cpumap.c | 2 +- tools/lib/perf/evlist.c | 4 ++-- tools/lib/perf/include/perf/cpumap.h | 4 ++-- tools/lib/perf/libperf.map | 2 +- tools/perf/arch/arm/util/cs-etm.c | 10 +++++----- tools/perf/arch/arm64/util/arm-spe.c | 4 ++-- tools/perf/arch/x86/util/intel-bts.c | 4 ++-- tools/perf/arch/x86/util/intel-pt.c | 10 +++++----- tools/perf/builtin-c2c.c | 2 +- tools/perf/builtin-stat.c | 6 +++--- tools/perf/util/auxtrace.c | 4 ++-- tools/perf/util/record.c | 2 +- tools/perf/util/stat.c | 2 +- 14 files changed, 29 insertions(+), 29 deletions(-) diff --git a/tools/lib/perf/Documentation/libperf.txt b/tools/lib/perf/Documentation/libperf.txt index a256a26598b04..fcfb9499ef9cd 100644 --- a/tools/lib/perf/Documentation/libperf.txt +++ b/tools/lib/perf/Documentation/libperf.txt @@ -46,7 +46,7 @@ SYNOPSIS void perf_cpu_map__put(struct perf_cpu_map *map); int perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx); int perf_cpu_map__nr(const struct perf_cpu_map *cpus); - bool perf_cpu_map__empty(const struct perf_cpu_map *map); + bool perf_cpu_map__has_any_cpu_or_is_empty(const struct perf_cpu_map *map); int perf_cpu_map__max(struct perf_cpu_map *map); bool perf_cpu_map__has(const struct perf_cpu_map *map, int cpu); diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c index 3aa80d0d26e86..4adcd7920d033 100644 --- a/tools/lib/perf/cpumap.c +++ b/tools/lib/perf/cpumap.c @@ -311,7 +311,7 @@ int perf_cpu_map__nr(const struct perf_cpu_map *cpus) return cpus ? __perf_cpu_map__nr(cpus) : 1; } -bool perf_cpu_map__empty(const struct perf_cpu_map *map) +bool perf_cpu_map__has_any_cpu_or_is_empty(const struct perf_cpu_map *map) { return map ? __perf_cpu_map__cpu(map, 0).cpu == -1 : true; } diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 3acbbccc19019..75f36218fdd98 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -619,7 +619,7 @@ static int perf_evlist__nr_mmaps(struct perf_evlist *evlist) /* One for each CPU */ nr_mmaps = perf_cpu_map__nr(evlist->all_cpus); - if (perf_cpu_map__empty(evlist->all_cpus)) { + if (perf_cpu_map__has_any_cpu_or_is_empty(evlist->all_cpus)) { /* Plus one for each thread */ nr_mmaps += perf_thread_map__nr(evlist->threads); /* Minus the per-thread CPU (-1) */ @@ -653,7 +653,7 @@ int perf_evlist__mmap_ops(struct perf_evlist *evlist, if (evlist->pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) < 0) return -ENOMEM; - if (perf_cpu_map__empty(cpus)) + if (perf_cpu_map__has_any_cpu_or_is_empty(cpus)) return mmap_per_thread(evlist, ops, mp); return mmap_per_cpu(evlist, ops, mp); diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h index b24bd8b8f34e1..9cf361fc5edcb 100644 --- a/tools/lib/perf/include/perf/cpumap.h +++ b/tools/lib/perf/include/perf/cpumap.h @@ -47,9 +47,9 @@ LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map); LIBPERF_API struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx); LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus); /** - * perf_cpu_map__empty - is map either empty or the "any CPU"/dummy value. + * perf_cpu_map__has_any_cpu_or_is_empty - is map either empty or has the "any CPU"/dummy value. */ -LIBPERF_API bool perf_cpu_map__empty(const struct perf_cpu_map *map); +LIBPERF_API bool perf_cpu_map__has_any_cpu_or_is_empty(const struct perf_cpu_map *map); LIBPERF_API struct perf_cpu perf_cpu_map__max(const struct perf_cpu_map *map); LIBPERF_API bool perf_cpu_map__has(const struct perf_cpu_map *map, struct perf_cpu cpu); LIBPERF_API bool perf_cpu_map__equal(const struct perf_cpu_map *lhs, diff --git a/tools/lib/perf/libperf.map b/tools/lib/perf/libperf.map index 8a71f841498e4..10b3f37226426 100644 --- a/tools/lib/perf/libperf.map +++ b/tools/lib/perf/libperf.map @@ -9,7 +9,7 @@ LIBPERF_0.0.1 { perf_cpu_map__read; perf_cpu_map__nr; perf_cpu_map__cpu; - perf_cpu_map__empty; + perf_cpu_map__has_any_cpu_or_is_empty; perf_cpu_map__max; perf_cpu_map__has; perf_thread_map__new_array; diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 2cf873d71dff0..c6b7b3066324e 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -211,7 +211,7 @@ static int cs_etm_validate_config(struct auxtrace_record *itr, * program can run on any CPUs in this case, thus don't skip * validation. */ - if (!perf_cpu_map__empty(event_cpus) && + if (!perf_cpu_map__has_any_cpu_or_is_empty(event_cpus) && !perf_cpu_map__has(event_cpus, cpu)) continue; @@ -435,7 +435,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, * Also the case of per-cpu mmaps, need the contextID in order to be notified * when a context switch happened. */ - if (!perf_cpu_map__empty(cpus)) { + if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel, "timestamp", 1); evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel, @@ -461,7 +461,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, evsel->core.attr.sample_period = 1; /* In per-cpu case, always need the time of mmap events etc */ - if (!perf_cpu_map__empty(cpus)) + if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) evsel__set_sample_bit(evsel, TIME); err = cs_etm_validate_config(itr, cs_etm_evsel); @@ -539,7 +539,7 @@ cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused, struct perf_cpu_map *online_cpus = perf_cpu_map__new(NULL); /* cpu map is not empty, we have specific CPUs to work with */ - if (!perf_cpu_map__empty(event_cpus)) { + if (!perf_cpu_map__has_any_cpu_or_is_empty(event_cpus)) { for (i = 0; i < cpu__max_cpu().cpu; i++) { struct perf_cpu cpu = { .cpu = i, }; @@ -814,7 +814,7 @@ static int cs_etm_info_fill(struct auxtrace_record *itr, return -EINVAL; /* If the cpu_map is empty all online CPUs are involved */ - if (perf_cpu_map__empty(event_cpus)) { + if (perf_cpu_map__has_any_cpu_or_is_empty(event_cpus)) { cpu_map = online_cpus; } else { /* Make sure all specified CPUs are online */ diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index e3acc739bd002..51ccbfd3d246d 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -232,7 +232,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, * In the case of per-cpu mmaps, sample CPU for AUX event; * also enable the timestamp tracing for samples correlation. */ - if (!perf_cpu_map__empty(cpus)) { + if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { evsel__set_sample_bit(arm_spe_evsel, CPU); evsel__set_config_if_unset(arm_spe_pmu, arm_spe_evsel, "ts_enable", 1); @@ -265,7 +265,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, tracking_evsel->core.attr.sample_period = 1; /* In per-cpu case, always need the time of mmap events etc */ - if (!perf_cpu_map__empty(cpus)) { + if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { evsel__set_sample_bit(tracking_evsel, TIME); evsel__set_sample_bit(tracking_evsel, CPU); diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index d2c8cac114702..af8ae4647585b 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -143,7 +143,7 @@ static int intel_bts_recording_options(struct auxtrace_record *itr, if (!opts->full_auxtrace) return 0; - if (opts->full_auxtrace && !perf_cpu_map__empty(cpus)) { + if (opts->full_auxtrace && !perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { pr_err(INTEL_BTS_PMU_NAME " does not support per-cpu recording\n"); return -EINVAL; } @@ -224,7 +224,7 @@ static int intel_bts_recording_options(struct auxtrace_record *itr, * In the case of per-cpu mmaps, we need the CPU on the * AUX event. */ - if (!perf_cpu_map__empty(cpus)) + if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) evsel__set_sample_bit(intel_bts_evsel, CPU); } diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index fa0c718b9e727..d199619df3abe 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -369,7 +369,7 @@ static int intel_pt_info_fill(struct auxtrace_record *itr, ui__warning("Intel Processor Trace: TSC not available\n"); } - per_cpu_mmaps = !perf_cpu_map__empty(session->evlist->core.user_requested_cpus); + per_cpu_mmaps = !perf_cpu_map__has_any_cpu_or_is_empty(session->evlist->core.user_requested_cpus); auxtrace_info->type = PERF_AUXTRACE_INTEL_PT; auxtrace_info->priv[INTEL_PT_PMU_TYPE] = intel_pt_pmu->type; @@ -774,7 +774,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * Per-cpu recording needs sched_switch events to distinguish different * threads. */ - if (have_timing_info && !perf_cpu_map__empty(cpus) && + if (have_timing_info && !perf_cpu_map__has_any_cpu_or_is_empty(cpus) && !record_opts__no_switch_events(opts)) { if (perf_can_record_switch_events()) { bool cpu_wide = !target__none(&opts->target) && @@ -832,7 +832,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * In the case of per-cpu mmaps, we need the CPU on the * AUX event. */ - if (!perf_cpu_map__empty(cpus)) + if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) evsel__set_sample_bit(intel_pt_evsel, CPU); } @@ -858,7 +858,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, tracking_evsel->immediate = true; /* In per-cpu case, always need the time of mmap events etc */ - if (!perf_cpu_map__empty(cpus)) { + if (!perf_cpu_map__has_any_cpu_or_is_empty(cpus)) { evsel__set_sample_bit(tracking_evsel, TIME); /* And the CPU for switch events */ evsel__set_sample_bit(tracking_evsel, CPU); @@ -870,7 +870,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * Warn the user when we do not have enough information to decode i.e. * per-cpu with no sched_switch (except workload-only). */ - if (!ptr->have_sched_switch && !perf_cpu_map__empty(cpus) && + if (!ptr->have_sched_switch && !perf_cpu_map__has_any_cpu_or_is_empty(cpus) && !target__none(&opts->target) && !intel_pt_evsel->core.attr.exclude_user) ui__warning("Intel Processor Trace decoding will not be possible except for kernel tracing!\n"); diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index a4cf9de7a7b5a..f78eea9e21539 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -2320,7 +2320,7 @@ static int setup_nodes(struct perf_session *session) nodes[node] = set; /* empty node, skip */ - if (perf_cpu_map__empty(map)) + if (perf_cpu_map__has_any_cpu_or_is_empty(map)) continue; perf_cpu_map__for_each_cpu(cpu, idx, map) { diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 0bfa70791cfcf..bda020c0b9d52 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1316,7 +1316,7 @@ static int cpu__get_cache_id_from_map(struct perf_cpu cpu, char *map) * be the first online CPU in the cache domain else use the * first online CPU of the cache domain as the ID. */ - if (perf_cpu_map__empty(cpu_map)) + if (perf_cpu_map__has_any_cpu_or_is_empty(cpu_map)) id = cpu.cpu; else id = perf_cpu_map__cpu(cpu_map, 0).cpu; @@ -1622,7 +1622,7 @@ static int perf_stat_init_aggr_mode(void) * taking the highest cpu number to be the size of * the aggregation translate cpumap. */ - if (!perf_cpu_map__empty(evsel_list->core.user_requested_cpus)) + if (!perf_cpu_map__has_any_cpu_or_is_empty(evsel_list->core.user_requested_cpus)) nr = perf_cpu_map__max(evsel_list->core.user_requested_cpus).cpu; else nr = 0; @@ -2289,7 +2289,7 @@ int process_stat_config_event(struct perf_session *session, perf_event__read_stat_config(&stat_config, &event->stat_config); - if (perf_cpu_map__empty(st->cpus)) { + if (perf_cpu_map__has_any_cpu_or_is_empty(st->cpus)) { if (st->aggr_mode != AGGR_UNSET) pr_warning("warning: processing task data, aggregation mode not set\n"); } else if (st->aggr_mode != AGGR_UNSET) { diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index f528c4364d235..3684e6009b635 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -174,7 +174,7 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp, struct evlist *evlist, struct evsel *evsel, int idx) { - bool per_cpu = !perf_cpu_map__empty(evlist->core.user_requested_cpus); + bool per_cpu = !perf_cpu_map__has_any_cpu_or_is_empty(evlist->core.user_requested_cpus); mp->mmap_needed = evsel->needs_auxtrace_mmap; @@ -648,7 +648,7 @@ int auxtrace_parse_snapshot_options(struct auxtrace_record *itr, static int evlist__enable_event_idx(struct evlist *evlist, struct evsel *evsel, int idx) { - bool per_cpu_mmaps = !perf_cpu_map__empty(evlist->core.user_requested_cpus); + bool per_cpu_mmaps = !perf_cpu_map__has_any_cpu_or_is_empty(evlist->core.user_requested_cpus); if (per_cpu_mmaps) { struct perf_cpu evlist_cpu = perf_cpu_map__cpu(evlist->core.all_cpus, idx); diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 9eb5c6a08999e..40290382b2d70 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -237,7 +237,7 @@ bool evlist__can_select_event(struct evlist *evlist, const char *str) evsel = evlist__last(temp_evlist); - if (!evlist || perf_cpu_map__empty(evlist->core.user_requested_cpus)) { + if (!evlist || perf_cpu_map__has_any_cpu_or_is_empty(evlist->core.user_requested_cpus)) { struct perf_cpu_map *cpus = perf_cpu_map__new(NULL); if (cpus) diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index ec35060422173..012c4946b9c49 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -315,7 +315,7 @@ static int check_per_pkg(struct evsel *counter, struct perf_counts_values *vals, if (!counter->per_pkg) return 0; - if (perf_cpu_map__empty(cpus)) + if (perf_cpu_map__has_any_cpu_or_is_empty(cpus)) return 0; if (!mask) { -- GitLab From effe957c6bb70cac12918c0f5fd4cefb35967618 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 28 Nov 2023 22:02:01 -0800 Subject: [PATCH 0314/1290] libperf cpumap: Replace usage of perf_cpu_map__new(NULL) with perf_cpu_map__new_online_cpus() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Passing NULL to perf_cpu_map__new() performs perf_cpu_map__new_online_cpus(), just directly call perf_cpu_map__new_online_cpus() to be more intention revealing. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Jajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Cc: bpf@vger.kernel.org Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20231129060211.1890454-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/Documentation/examples/sampling.c | 2 +- tools/lib/perf/Documentation/libperf-sampling.txt | 2 +- tools/lib/perf/evlist.c | 2 +- tools/lib/perf/tests/test-evlist.c | 4 ++-- tools/lib/perf/tests/test-evsel.c | 2 +- tools/perf/arch/arm/util/cs-etm.c | 6 +++--- tools/perf/arch/arm64/util/header.c | 2 +- tools/perf/bench/epoll-ctl.c | 2 +- tools/perf/bench/epoll-wait.c | 2 +- tools/perf/bench/futex-hash.c | 2 +- tools/perf/bench/futex-lock-pi.c | 2 +- tools/perf/bench/futex-requeue.c | 2 +- tools/perf/bench/futex-wake-parallel.c | 2 +- tools/perf/bench/futex-wake.c | 2 +- tools/perf/builtin-ftrace.c | 2 +- tools/perf/tests/code-reading.c | 2 +- tools/perf/tests/keep-tracking.c | 2 +- tools/perf/tests/mmap-basic.c | 2 +- tools/perf/tests/openat-syscall-all-cpus.c | 2 +- tools/perf/tests/perf-time-to-tsc.c | 2 +- tools/perf/tests/switch-tracking.c | 2 +- tools/perf/tests/topology.c | 2 +- tools/perf/util/bpf_counter.c | 2 +- tools/perf/util/cpumap.c | 2 +- tools/perf/util/cputopo.c | 2 +- tools/perf/util/evlist.c | 2 +- tools/perf/util/perf_api_probe.c | 4 ++-- tools/perf/util/record.c | 2 +- 28 files changed, 32 insertions(+), 32 deletions(-) diff --git a/tools/lib/perf/Documentation/examples/sampling.c b/tools/lib/perf/Documentation/examples/sampling.c index 8e1a926a9cfe6..bc142f0664b5a 100644 --- a/tools/lib/perf/Documentation/examples/sampling.c +++ b/tools/lib/perf/Documentation/examples/sampling.c @@ -39,7 +39,7 @@ int main(int argc, char **argv) libperf_init(libperf_print); - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); if (!cpus) { fprintf(stderr, "failed to create cpus\n"); return -1; diff --git a/tools/lib/perf/Documentation/libperf-sampling.txt b/tools/lib/perf/Documentation/libperf-sampling.txt index d6ca24f6ef78f..2378980fab8a6 100644 --- a/tools/lib/perf/Documentation/libperf-sampling.txt +++ b/tools/lib/perf/Documentation/libperf-sampling.txt @@ -97,7 +97,7 @@ In this case we will monitor all the available CPUs: [source,c] -- - 42 cpus = perf_cpu_map__new(NULL); + 42 cpus = perf_cpu_map__new_online_cpus(); 43 if (!cpus) { 44 fprintf(stderr, "failed to create cpus\n"); 45 return -1; diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 75f36218fdd98..058e3ff10f9b2 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -39,7 +39,7 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, if (evsel->system_wide) { /* System wide: set the cpu map of the evsel to all online CPUs. */ perf_cpu_map__put(evsel->cpus); - evsel->cpus = perf_cpu_map__new(NULL); + evsel->cpus = perf_cpu_map__new_online_cpus(); } else if (evlist->has_user_cpus && evsel->is_pmu_core) { /* * User requested CPUs on a core PMU, ensure the requested CPUs diff --git a/tools/lib/perf/tests/test-evlist.c b/tools/lib/perf/tests/test-evlist.c index ab63878bacb99..10f70cb41ff1d 100644 --- a/tools/lib/perf/tests/test-evlist.c +++ b/tools/lib/perf/tests/test-evlist.c @@ -46,7 +46,7 @@ static int test_stat_cpu(void) }; int err, idx; - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); __T("failed to create cpus", cpus); evlist = perf_evlist__new(); @@ -350,7 +350,7 @@ static int test_mmap_cpus(void) attr.config = id; - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); __T("failed to create cpus", cpus); evlist = perf_evlist__new(); diff --git a/tools/lib/perf/tests/test-evsel.c b/tools/lib/perf/tests/test-evsel.c index a11fc51bfb688..545ec31505466 100644 --- a/tools/lib/perf/tests/test-evsel.c +++ b/tools/lib/perf/tests/test-evsel.c @@ -27,7 +27,7 @@ static int test_stat_cpu(void) }; int err, idx; - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); __T("failed to create cpus", cpus); evsel = perf_evsel__new(&attr); diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index c6b7b3066324e..77e6663c1703b 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -199,7 +199,7 @@ static int cs_etm_validate_config(struct auxtrace_record *itr, { int i, err = -EINVAL; struct perf_cpu_map *event_cpus = evsel->evlist->core.user_requested_cpus; - struct perf_cpu_map *online_cpus = perf_cpu_map__new(NULL); + struct perf_cpu_map *online_cpus = perf_cpu_map__new_online_cpus(); /* Set option of each CPU we have */ for (i = 0; i < cpu__max_cpu().cpu; i++) { @@ -536,7 +536,7 @@ cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused, int i; int etmv3 = 0, etmv4 = 0, ete = 0; struct perf_cpu_map *event_cpus = evlist->core.user_requested_cpus; - struct perf_cpu_map *online_cpus = perf_cpu_map__new(NULL); + struct perf_cpu_map *online_cpus = perf_cpu_map__new_online_cpus(); /* cpu map is not empty, we have specific CPUs to work with */ if (!perf_cpu_map__has_any_cpu_or_is_empty(event_cpus)) { @@ -802,7 +802,7 @@ static int cs_etm_info_fill(struct auxtrace_record *itr, u64 nr_cpu, type; struct perf_cpu_map *cpu_map; struct perf_cpu_map *event_cpus = session->evlist->core.user_requested_cpus; - struct perf_cpu_map *online_cpus = perf_cpu_map__new(NULL); + struct perf_cpu_map *online_cpus = perf_cpu_map__new_online_cpus(); struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c index a2eef9ec54910..97037499152ef 100644 --- a/tools/perf/arch/arm64/util/header.c +++ b/tools/perf/arch/arm64/util/header.c @@ -57,7 +57,7 @@ static int _get_cpuid(char *buf, size_t sz, struct perf_cpu_map *cpus) int get_cpuid(char *buf, size_t sz) { - struct perf_cpu_map *cpus = perf_cpu_map__new(NULL); + struct perf_cpu_map *cpus = perf_cpu_map__new_online_cpus(); int ret; if (!cpus) diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c index 6bfffe83dde99..d3db73dac66af 100644 --- a/tools/perf/bench/epoll-ctl.c +++ b/tools/perf/bench/epoll-ctl.c @@ -330,7 +330,7 @@ int bench_epoll_ctl(int argc, const char **argv) act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) goto errmem; diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c index cb5174b53940b..06bb3187660ab 100644 --- a/tools/perf/bench/epoll-wait.c +++ b/tools/perf/bench/epoll-wait.c @@ -444,7 +444,7 @@ int bench_epoll_wait(int argc, const char **argv) act.sa_sigaction = toggle_done; sigaction(SIGINT, &act, NULL); - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) goto errmem; diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index 2005a3fa30267..0c69d20efa329 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -138,7 +138,7 @@ int bench_futex_hash(int argc, const char **argv) exit(EXIT_FAILURE); } - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) goto errmem; diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index 092cbd52db82b..7a4973346180f 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -172,7 +172,7 @@ int bench_futex_lock_pi(int argc, const char **argv) if (argc) goto err; - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) err(EXIT_FAILURE, "calloc"); diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c index c0035990a33ce..d9ad736c1a3e0 100644 --- a/tools/perf/bench/futex-requeue.c +++ b/tools/perf/bench/futex-requeue.c @@ -174,7 +174,7 @@ int bench_futex_requeue(int argc, const char **argv) if (argc) goto err; - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) err(EXIT_FAILURE, "cpu_map__new"); diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c index 5ab0234d74e69..b66df553e5614 100644 --- a/tools/perf/bench/futex-wake-parallel.c +++ b/tools/perf/bench/futex-wake-parallel.c @@ -264,7 +264,7 @@ int bench_futex_wake_parallel(int argc, const char **argv) err(EXIT_FAILURE, "mlockall"); } - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) err(EXIT_FAILURE, "calloc"); diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c index 18a5894af8bb5..690fd6d3da130 100644 --- a/tools/perf/bench/futex-wake.c +++ b/tools/perf/bench/futex-wake.c @@ -149,7 +149,7 @@ int bench_futex_wake(int argc, const char **argv) exit(EXIT_FAILURE); } - cpu = perf_cpu_map__new(NULL); + cpu = perf_cpu_map__new_online_cpus(); if (!cpu) err(EXIT_FAILURE, "calloc"); diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index ac2e6c75f9120..eb30c8eca4887 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -333,7 +333,7 @@ static int set_tracing_func_irqinfo(struct perf_ftrace *ftrace) static int reset_tracing_cpu(void) { - struct perf_cpu_map *cpumap = perf_cpu_map__new(NULL); + struct perf_cpu_map *cpumap = perf_cpu_map__new_online_cpus(); int ret; ret = set_tracing_cpumask(cpumap); diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 8620146d03783..7a3a7bbbec714 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -610,7 +610,7 @@ static int do_test_code_reading(bool try_kcore) goto out_put; } - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); if (!cpus) { pr_debug("perf_cpu_map__new failed\n"); goto out_put; diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index 8f4f9b632e1e5..5a3b2bed07f32 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -81,7 +81,7 @@ static int test__keep_tracking(struct test_suite *test __maybe_unused, int subte threads = thread_map__new(-1, getpid(), UINT_MAX); CHECK_NOT_NULL__(threads); - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); CHECK_NOT_NULL__(cpus); evlist = evlist__new(); diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 886a13a77a162..012c8ae439fdc 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -52,7 +52,7 @@ static int test__basic_mmap(struct test_suite *test __maybe_unused, int subtest return -1; } - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); if (cpus == NULL) { pr_debug("perf_cpu_map__new\n"); goto out_free_threads; diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c index f3275be83a338..fb114118c8764 100644 --- a/tools/perf/tests/openat-syscall-all-cpus.c +++ b/tools/perf/tests/openat-syscall-all-cpus.c @@ -37,7 +37,7 @@ static int test__openat_syscall_event_on_all_cpus(struct test_suite *test __mayb return -1; } - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); if (cpus == NULL) { pr_debug("perf_cpu_map__new\n"); goto out_thread_map_delete; diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c index efcd71c2738af..bbe2ddeb9b745 100644 --- a/tools/perf/tests/perf-time-to-tsc.c +++ b/tools/perf/tests/perf-time-to-tsc.c @@ -93,7 +93,7 @@ static int test__perf_time_to_tsc(struct test_suite *test __maybe_unused, int su threads = thread_map__new(-1, getpid(), UINT_MAX); CHECK_NOT_NULL__(threads); - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); CHECK_NOT_NULL__(cpus); evlist = evlist__new(); diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index e52b031bedc5a..5cab17a1942e6 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -351,7 +351,7 @@ static int test__switch_tracking(struct test_suite *test __maybe_unused, int sub goto out_err; } - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); if (!cpus) { pr_debug("perf_cpu_map__new failed!\n"); goto out_err; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 9dee63734e66a..2a842f53fbb57 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -215,7 +215,7 @@ static int test__session_topology(struct test_suite *test __maybe_unused, int su if (session_write_header(path)) goto free_path; - map = perf_cpu_map__new(NULL); + map = perf_cpu_map__new_online_cpus(); if (map == NULL) { pr_debug("failed to get system cpumap\n"); goto free_path; diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index 7f9b0e46e008c..7a8af60e0f515 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -455,7 +455,7 @@ static int bperf__load(struct evsel *evsel, struct target *target) return -1; if (!all_cpu_map) { - all_cpu_map = perf_cpu_map__new(NULL); + all_cpu_map = perf_cpu_map__new_online_cpus(); if (!all_cpu_map) return -1; } diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 0e090e8bc3349..0581ee0fa5f27 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -672,7 +672,7 @@ struct perf_cpu_map *cpu_map__online(void) /* thread unsafe */ static struct perf_cpu_map *online; if (!online) - online = perf_cpu_map__new(NULL); /* from /sys/devices/system/cpu/online */ + online = perf_cpu_map__new_online_cpus(); /* from /sys/devices/system/cpu/online */ return online; } diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index 81cfc85f46682..8bbeb2dc76fda 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -267,7 +267,7 @@ struct cpu_topology *cpu_topology__new(void) ncpus = cpu__max_present_cpu().cpu; /* build online CPU map */ - map = perf_cpu_map__new(NULL); + map = perf_cpu_map__new_online_cpus(); if (map == NULL) { pr_debug("failed to get system cpumap\n"); return NULL; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index ff7f85ded89d7..0ed3ce2aa8ebf 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1352,7 +1352,7 @@ static int evlist__create_syswide_maps(struct evlist *evlist) * error, and we may not want to do that fallback to a * default cpu identity map :-\ */ - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); if (!cpus) goto out; diff --git a/tools/perf/util/perf_api_probe.c b/tools/perf/util/perf_api_probe.c index e1e2d701599c4..1de3b69cdf4aa 100644 --- a/tools/perf/util/perf_api_probe.c +++ b/tools/perf/util/perf_api_probe.c @@ -64,7 +64,7 @@ static bool perf_probe_api(setup_probe_fn_t fn) struct perf_cpu cpu; int ret, i = 0; - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); if (!cpus) return false; cpu = perf_cpu_map__cpu(cpus, 0); @@ -140,7 +140,7 @@ bool perf_can_record_cpu_wide(void) struct perf_cpu cpu; int fd; - cpus = perf_cpu_map__new(NULL); + cpus = perf_cpu_map__new_online_cpus(); if (!cpus) return false; diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 40290382b2d70..87e817b3cf7e9 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -238,7 +238,7 @@ bool evlist__can_select_event(struct evlist *evlist, const char *str) evsel = evlist__last(temp_evlist); if (!evlist || perf_cpu_map__has_any_cpu_or_is_empty(evlist->core.user_requested_cpus)) { - struct perf_cpu_map *cpus = perf_cpu_map__new(NULL); + struct perf_cpu_map *cpus = perf_cpu_map__new_online_cpus(); if (cpus) cpu = perf_cpu_map__cpu(cpus, 0); -- GitLab From 5805c82513c444333efb086017be8d666336858a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 28 Nov 2023 22:02:02 -0800 Subject: [PATCH 0315/1290] libperf cpumap: Add for_each_cpu() that skips the "any CPU" case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When iterating CPUs in a CPU map it is often desirable to skip the "any CPU" (aka dummy) case. Add a helper for this and use in builtin-record. Reviewed-by: James Clark Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Jajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Cc: bpf@vger.kernel.org Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20231129060211.1890454-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/perf/cpumap.h | 6 ++++++ tools/perf/builtin-record.c | 4 +--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h index 9cf361fc5edcb..dbe0a7352b648 100644 --- a/tools/lib/perf/include/perf/cpumap.h +++ b/tools/lib/perf/include/perf/cpumap.h @@ -64,6 +64,12 @@ LIBPERF_API bool perf_cpu_map__has_any_cpu(const struct perf_cpu_map *map); (idx) < perf_cpu_map__nr(cpus); \ (idx)++, (cpu) = perf_cpu_map__cpu(cpus, idx)) +#define perf_cpu_map__for_each_cpu_skip_any(_cpu, idx, cpus) \ + for ((idx) = 0, (_cpu) = perf_cpu_map__cpu(cpus, idx); \ + (idx) < perf_cpu_map__nr(cpus); \ + (idx)++, (_cpu) = perf_cpu_map__cpu(cpus, idx)) \ + if ((_cpu).cpu != -1) + #define perf_cpu_map__for_each_idx(idx, cpus) \ for ((idx) = 0; (idx) < perf_cpu_map__nr(cpus); (idx)++) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index eb5a398ddb1d8..8a1002683fda2 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -3601,9 +3601,7 @@ static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cp if (cpu_map__is_dummy(cpus)) return 0; - perf_cpu_map__for_each_cpu(cpu, idx, cpus) { - if (cpu.cpu == -1) - continue; + perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) { /* Return ENODEV is input cpu is greater than max cpu */ if ((unsigned long)cpu.cpu > mask->nbits) return -ENODEV; -- GitLab From e4af6bb1f62fe30f19171322374f021bbcdc5b04 Mon Sep 17 00:00:00 2001 From: Jiri Valek - 2N Date: Mon, 11 Dec 2023 18:54:22 -0800 Subject: [PATCH 0316/1290] dt-bindings: input: microchip,cap11xx: add advanced sensitivity settings Add support for advanced sensitivity settings and signal guard feature. Signed-off-by: Jiri Valek - 2N Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231121155250.613242-2-jiriv@axis.com Signed-off-by: Dmitry Torokhov --- .../bindings/input/microchip,cap11xx.yaml | 80 ++++++++++++++++++- 1 file changed, 77 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/input/microchip,cap11xx.yaml b/Documentation/devicetree/bindings/input/microchip,cap11xx.yaml index 5b5d4f7d34827..7ade03f1b32b8 100644 --- a/Documentation/devicetree/bindings/input/microchip,cap11xx.yaml +++ b/Documentation/devicetree/bindings/input/microchip,cap11xx.yaml @@ -45,13 +45,13 @@ properties: Enables the Linux input system's autorepeat feature on the input device. linux,keycodes: - minItems: 6 - maxItems: 6 + minItems: 3 + maxItems: 8 description: | Specifies an array of numeric keycode values to be used for the channels. If this property is omitted, KEY_A, KEY_B, etc are used as defaults. - The array must have exactly six entries. + The number of entries must correspond to the number of channels. microchip,sensor-gain: $ref: /schemas/types.yaml#/definitions/uint32 @@ -70,6 +70,59 @@ properties: open drain. This property allows using the active high push-pull output. + microchip,sensitivity-delta-sense: + $ref: /schemas/types.yaml#/definitions/uint32 + default: 32 + enum: [1, 2, 4, 8, 16, 32, 64, 128] + description: + Controls the sensitivity multiplier of a touch detection. + Higher value means more sensitive settings. + At the more sensitive settings, touches are detected for a smaller delta + capacitance corresponding to a "lighter" touch. + + microchip,signal-guard: + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 3 + maxItems: 8 + items: + enum: [0, 1] + description: | + 0 - off + 1 - on + The signal guard isolates the signal from virtual grounds. + If enabled then the behavior of the channel is changed to signal guard. + The number of entries must correspond to the number of channels. + + microchip,input-threshold: + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 3 + maxItems: 8 + items: + minimum: 0 + maximum: 127 + description: + Specifies the delta threshold that is used to determine if a touch has + been detected. A higher value means a larger difference in capacitance + is required for a touch to be registered, making the touch sensor less + sensitive. + The number of entries must correspond to the number of channels. + + microchip,calib-sensitivity: + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 3 + maxItems: 8 + items: + enum: [1, 2, 4] + description: | + Specifies an array of numeric values that controls the gain + used by the calibration routine to enable sensor inputs + to be more sensitive for proximity detection. + Gain is based on touch pad capacitance range + 1 - 5-50pF + 2 - 0-25pF + 4 - 0-12.5pF + The number of entries must correspond to the number of channels. + patternProperties: "^led@[0-7]$": type: object @@ -99,10 +152,29 @@ allOf: contains: enum: - microchip,cap1106 + - microchip,cap1203 + - microchip,cap1206 + - microchip,cap1293 + - microchip,cap1298 then: patternProperties: "^led@[0-7]$": false + - if: + properties: + compatible: + contains: + enum: + - microchip,cap1106 + - microchip,cap1126 + - microchip,cap1188 + - microchip,cap1203 + - microchip,cap1206 + then: + properties: + microchip,signal-guard: false + microchip,calib-sensitivity: false + required: - compatible - interrupts @@ -122,6 +194,8 @@ examples: reg = <0x28>; autorepeat; microchip,sensor-gain = <2>; + microchip,sensitivity-delta-sense = <16>; + microchip,input-threshold = <21>, <18>, <46>, <46>, <46>, <21>; linux,keycodes = <103>, /* KEY_UP */ <106>, /* KEY_RIGHT */ -- GitLab From 2e3ae00021901461455b6a9ff4ef850817e9ab6d Mon Sep 17 00:00:00 2001 From: Jiri Valek - 2N Date: Mon, 11 Dec 2023 18:54:45 -0800 Subject: [PATCH 0317/1290] Input: cap11xx - add advanced sensitivity settings Add support for advanced sensitivity settings that allows more precise tunig of touch buttons. Input-treshold allows to set the sensitivity for each channel separately. Also add signal guard feature for CAP129x chips. Signed-off-by: Jiri Valek - 2N Link: https://lore.kernel.org/r/20231121155250.613242-3-jiriv@axis.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/cap11xx.c | 237 +++++++++++++++++++++++++------ 1 file changed, 192 insertions(+), 45 deletions(-) diff --git a/drivers/input/keyboard/cap11xx.c b/drivers/input/keyboard/cap11xx.c index 01e7ead68fead..ebcbc00d2059e 100644 --- a/drivers/input/keyboard/cap11xx.c +++ b/drivers/input/keyboard/cap11xx.c @@ -14,6 +14,7 @@ #include #include #include +#include #define CAP11XX_REG_MAIN_CONTROL 0x00 #define CAP11XX_REG_MAIN_CONTROL_GAIN_SHIFT (6) @@ -24,6 +25,7 @@ #define CAP11XX_REG_NOISE_FLAG_STATUS 0x0a #define CAP11XX_REG_SENOR_DELTA(X) (0x10 + (X)) #define CAP11XX_REG_SENSITIVITY_CONTROL 0x1f +#define CAP11XX_REG_SENSITIVITY_CONTROL_DELTA_SENSE_MASK 0x70 #define CAP11XX_REG_CONFIG 0x20 #define CAP11XX_REG_SENSOR_ENABLE 0x21 #define CAP11XX_REG_SENSOR_CONFIG 0x22 @@ -32,6 +34,7 @@ #define CAP11XX_REG_CALIBRATION 0x26 #define CAP11XX_REG_INT_ENABLE 0x27 #define CAP11XX_REG_REPEAT_RATE 0x28 +#define CAP11XX_REG_SIGNAL_GUARD_ENABLE 0x29 #define CAP11XX_REG_MT_CONFIG 0x2a #define CAP11XX_REG_MT_PATTERN_CONFIG 0x2b #define CAP11XX_REG_MT_PATTERN 0x2d @@ -47,6 +50,8 @@ #define CAP11XX_REG_SENSOR_BASE_CNT(X) (0x50 + (X)) #define CAP11XX_REG_LED_POLARITY 0x73 #define CAP11XX_REG_LED_OUTPUT_CONTROL 0x74 +#define CAP11XX_REG_CALIB_SENSITIVITY_CONFIG 0x80 +#define CAP11XX_REG_CALIB_SENSITIVITY_CONFIG2 0x81 #define CAP11XX_REG_LED_DUTY_CYCLE_1 0x90 #define CAP11XX_REG_LED_DUTY_CYCLE_2 0x91 @@ -78,12 +83,20 @@ struct cap11xx_led { struct cap11xx_priv { struct regmap *regmap; + struct device *dev; struct input_dev *idev; + const struct cap11xx_hw_model *model; + u8 id; struct cap11xx_led *leds; int num_leds; /* config */ + u8 analog_gain; + u8 sensitivity_delta_sense; + u8 signal_guard_inputs_mask; + u32 thresholds[8]; + u32 calib_sensitivities[8]; u32 keycodes[]; }; @@ -181,6 +194,175 @@ static const struct regmap_config cap11xx_regmap_config = { .volatile_reg = cap11xx_volatile_reg, }; +static int cap11xx_write_calib_sens_config_1(struct cap11xx_priv *priv) +{ + return regmap_write(priv->regmap, + CAP11XX_REG_CALIB_SENSITIVITY_CONFIG, + (priv->calib_sensitivities[3] << 6) | + (priv->calib_sensitivities[2] << 4) | + (priv->calib_sensitivities[1] << 2) | + priv->calib_sensitivities[0]); +} + +static int cap11xx_write_calib_sens_config_2(struct cap11xx_priv *priv) +{ + return regmap_write(priv->regmap, + CAP11XX_REG_CALIB_SENSITIVITY_CONFIG2, + (priv->calib_sensitivities[7] << 6) | + (priv->calib_sensitivities[6] << 4) | + (priv->calib_sensitivities[5] << 2) | + priv->calib_sensitivities[4]); +} + +static int cap11xx_init_keys(struct cap11xx_priv *priv) +{ + struct device_node *node = priv->dev->of_node; + struct device *dev = priv->dev; + int i, error; + u32 u32_val; + + if (!node) { + dev_err(dev, "Corresponding DT entry is not available\n"); + return -ENODEV; + } + + if (!of_property_read_u32(node, "microchip,sensor-gain", &u32_val)) { + if (priv->model->no_gain) { + dev_warn(dev, + "This model doesn't support 'sensor-gain'\n"); + } else if (is_power_of_2(u32_val) && u32_val <= 8) { + priv->analog_gain = (u8)ilog2(u32_val); + + error = regmap_update_bits(priv->regmap, + CAP11XX_REG_MAIN_CONTROL, + CAP11XX_REG_MAIN_CONTROL_GAIN_MASK, + priv->analog_gain << CAP11XX_REG_MAIN_CONTROL_GAIN_SHIFT); + if (error) + return error; + } else { + dev_err(dev, "Invalid sensor-gain value %u\n", u32_val); + return -EINVAL; + } + } + + if (of_property_read_bool(node, "microchip,irq-active-high")) { + if (priv->id == CAP1106 || + priv->id == CAP1126 || + priv->id == CAP1188) { + error = regmap_update_bits(priv->regmap, + CAP11XX_REG_CONFIG2, + CAP11XX_REG_CONFIG2_ALT_POL, + 0); + if (error) + return error; + } else { + dev_warn(dev, + "This model doesn't support 'irq-active-high'\n"); + } + } + + if (!of_property_read_u32(node, "microchip,sensitivity-delta-sense", &u32_val)) { + if (!is_power_of_2(u32_val) || u32_val > 128) { + dev_err(dev, "Invalid sensitivity-delta-sense value %u\n", u32_val); + return -EINVAL; + } + + priv->sensitivity_delta_sense = (u8)ilog2(u32_val); + u32_val = ~(FIELD_PREP(CAP11XX_REG_SENSITIVITY_CONTROL_DELTA_SENSE_MASK, + priv->sensitivity_delta_sense)); + + error = regmap_update_bits(priv->regmap, + CAP11XX_REG_SENSITIVITY_CONTROL, + CAP11XX_REG_SENSITIVITY_CONTROL_DELTA_SENSE_MASK, + u32_val); + if (error) + return error; + } + + if (!of_property_read_u32_array(node, "microchip,input-threshold", + priv->thresholds, priv->model->num_channels)) { + for (i = 0; i < priv->model->num_channels; i++) { + if (priv->thresholds[i] > 127) { + dev_err(dev, "Invalid input-threshold value %u\n", + priv->thresholds[i]); + return -EINVAL; + } + + error = regmap_write(priv->regmap, + CAP11XX_REG_SENSOR_THRESH(i), + priv->thresholds[i]); + if (error) + return error; + } + } + + if (!of_property_read_u32_array(node, "microchip,calib-sensitivity", + priv->calib_sensitivities, + priv->model->num_channels)) { + if (priv->id == CAP1293 || priv->id == CAP1298) { + for (i = 0; i < priv->model->num_channels; i++) { + if (!is_power_of_2(priv->calib_sensitivities[i]) || + priv->calib_sensitivities[i] > 4) { + dev_err(dev, "Invalid calib-sensitivity value %u\n", + priv->calib_sensitivities[i]); + return -EINVAL; + } + priv->calib_sensitivities[i] = ilog2(priv->calib_sensitivities[i]); + } + + error = cap11xx_write_calib_sens_config_1(priv); + if (error) + return error; + + if (priv->id == CAP1298) { + error = cap11xx_write_calib_sens_config_2(priv); + if (error) + return error; + } + } else { + dev_warn(dev, + "This model doesn't support 'calib-sensitivity'\n"); + } + } + + for (i = 0; i < priv->model->num_channels; i++) { + if (!of_property_read_u32_index(node, "microchip,signal-guard", + i, &u32_val)) { + if (u32_val > 1) + return -EINVAL; + if (u32_val) + priv->signal_guard_inputs_mask |= 0x01 << i; + } + } + + if (priv->signal_guard_inputs_mask) { + if (priv->id == CAP1293 || priv->id == CAP1298) { + error = regmap_write(priv->regmap, + CAP11XX_REG_SIGNAL_GUARD_ENABLE, + priv->signal_guard_inputs_mask); + if (error) + return error; + } else { + dev_warn(dev, + "This model doesn't support 'signal-guard'\n"); + } + } + + /* Provide some useful defaults */ + for (i = 0; i < priv->model->num_channels; i++) + priv->keycodes[i] = KEY_A + i; + + of_property_read_u32_array(node, "linux,keycodes", + priv->keycodes, priv->model->num_channels); + + /* Disable autorepeat. The Linux input system has its own handling. */ + error = regmap_write(priv->regmap, CAP11XX_REG_REPEAT_RATE, 0); + if (error) + return error; + + return 0; +} + static irqreturn_t cap11xx_thread_func(int irq_num, void *data) { struct cap11xx_priv *priv = data; @@ -332,11 +514,9 @@ static int cap11xx_i2c_probe(struct i2c_client *i2c_client) const struct i2c_device_id *id = i2c_client_get_device_id(i2c_client); struct device *dev = &i2c_client->dev; struct cap11xx_priv *priv; - struct device_node *node; const struct cap11xx_hw_model *cap; - int i, error, gain = 0; + int i, error; unsigned int val, rev; - u32 gain32; if (id->driver_data >= ARRAY_SIZE(cap11xx_devices)) { dev_err(dev, "Invalid device ID %lu\n", id->driver_data); @@ -355,6 +535,8 @@ static int cap11xx_i2c_probe(struct i2c_client *i2c_client) if (!priv) return -ENOMEM; + priv->dev = dev; + priv->regmap = devm_regmap_init_i2c(i2c_client, &cap11xx_regmap_config); if (IS_ERR(priv->regmap)) return PTR_ERR(priv->regmap); @@ -384,50 +566,15 @@ static int cap11xx_i2c_probe(struct i2c_client *i2c_client) return error; dev_info(dev, "CAP11XX detected, model %s, revision 0x%02x\n", - id->name, rev); - node = dev->of_node; - - if (!of_property_read_u32(node, "microchip,sensor-gain", &gain32)) { - if (cap->no_gain) - dev_warn(dev, - "This version doesn't support sensor gain\n"); - else if (is_power_of_2(gain32) && gain32 <= 8) - gain = ilog2(gain32); - else - dev_err(dev, "Invalid sensor-gain value %d\n", gain32); - } + id->name, rev); - if (id->driver_data == CAP1106 || - id->driver_data == CAP1126 || - id->driver_data == CAP1188) { - if (of_property_read_bool(node, "microchip,irq-active-high")) { - error = regmap_update_bits(priv->regmap, - CAP11XX_REG_CONFIG2, - CAP11XX_REG_CONFIG2_ALT_POL, - 0); - if (error) - return error; - } - } + priv->model = cap; + priv->id = id->driver_data; - /* Provide some useful defaults */ - for (i = 0; i < cap->num_channels; i++) - priv->keycodes[i] = KEY_A + i; - - of_property_read_u32_array(node, "linux,keycodes", - priv->keycodes, cap->num_channels); - - if (!cap->no_gain) { - error = regmap_update_bits(priv->regmap, - CAP11XX_REG_MAIN_CONTROL, - CAP11XX_REG_MAIN_CONTROL_GAIN_MASK, - gain << CAP11XX_REG_MAIN_CONTROL_GAIN_SHIFT); - if (error) - return error; - } + dev_info(dev, "CAP11XX device detected, model %s, revision 0x%02x\n", + id->name, rev); - /* Disable autorepeat. The Linux input system has its own handling. */ - error = regmap_write(priv->regmap, CAP11XX_REG_REPEAT_RATE, 0); + error = cap11xx_init_keys(priv); if (error) return error; @@ -439,7 +586,7 @@ static int cap11xx_i2c_probe(struct i2c_client *i2c_client) priv->idev->id.bustype = BUS_I2C; priv->idev->evbit[0] = BIT_MASK(EV_KEY); - if (of_property_read_bool(node, "autorepeat")) + if (of_property_read_bool(dev->of_node, "autorepeat")) __set_bit(EV_REP, priv->idev->evbit); for (i = 0; i < cap->num_channels; i++) -- GitLab From 39bd68d422ba085997fb1f26160d7c686915b3cb Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sun, 1 Oct 2023 01:43:38 +0200 Subject: [PATCH 0318/1290] Input: cap11xx - cache hardware ID registers The cap11xx devices have three hardware identification registers which are currently marked as volatile, preventing caching of those registers. This is not ideal since the registers should never change at runtime, we should be able to cache the value after the first read. Stop marking the registers as volatile, we don't have register defaults specified in the driver so this will result in reading from the hardware on first use. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20231001-input-maple-v1-1-ed3716051431@kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/cap11xx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/input/keyboard/cap11xx.c b/drivers/input/keyboard/cap11xx.c index ebcbc00d2059e..8e25d7d5e64de 100644 --- a/drivers/input/keyboard/cap11xx.c +++ b/drivers/input/keyboard/cap11xx.c @@ -173,9 +173,6 @@ static bool cap11xx_volatile_reg(struct device *dev, unsigned int reg) case CAP11XX_REG_SENOR_DELTA(3): case CAP11XX_REG_SENOR_DELTA(4): case CAP11XX_REG_SENOR_DELTA(5): - case CAP11XX_REG_PRODUCT_ID: - case CAP11XX_REG_MANUFACTURER_ID: - case CAP11XX_REG_REVISION: return true; } -- GitLab From 718963d94197626f83544b63ca5581d16cffdac2 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sun, 1 Oct 2023 01:43:39 +0200 Subject: [PATCH 0319/1290] Input: cap11xx - convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20231001-input-maple-v1-2-ed3716051431@kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/cap11xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/cap11xx.c b/drivers/input/keyboard/cap11xx.c index 8e25d7d5e64de..52fba9ee7c1d8 100644 --- a/drivers/input/keyboard/cap11xx.c +++ b/drivers/input/keyboard/cap11xx.c @@ -187,7 +187,7 @@ static const struct regmap_config cap11xx_regmap_config = { .reg_defaults = cap11xx_reg_defaults, .num_reg_defaults = ARRAY_SIZE(cap11xx_reg_defaults), - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .volatile_reg = cap11xx_volatile_reg, }; -- GitLab From 5958274f1de5bcc435a11a92a5d555775c5ed341 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sun, 1 Oct 2023 01:43:40 +0200 Subject: [PATCH 0320/1290] Input: qt1050 - convert to use maple tree register cache The maple tree register cache is based on a much more modern data structure than the rbtree cache and makes optimisation choices which are probably more appropriate for modern systems than those made by the rbtree cache. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20231001-input-maple-v1-3-ed3716051431@kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/qt1050.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/qt1050.c b/drivers/input/keyboard/qt1050.c index 6953097db4456..b51dfcd760386 100644 --- a/drivers/input/keyboard/qt1050.c +++ b/drivers/input/keyboard/qt1050.c @@ -213,7 +213,7 @@ static struct regmap_config qt1050_regmap_config = { .val_bits = 8, .max_register = QT1050_RES_CAL, - .cache_type = REGCACHE_RBTREE, + .cache_type = REGCACHE_MAPLE, .wr_table = &qt1050_writeable_table, .rd_table = &qt1050_readable_table, -- GitLab From f737020d24e47d6e8b893a2fee2b43268a6af629 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 6 Oct 2023 12:03:20 +0200 Subject: [PATCH 0321/1290] Input: max77693-haptic - add device-tree compatible strings Add the needed device-tree compatible strings to the MAX77693 haptic driver, so it can be automatically loaded when compiled as a kernel module and given device-tree contains separate (i.e. 'motor-driver') node under the main PMIC node. When device is instantiated from device-tree, the driver data cannot be read via platform_get_device_id(), so get device type from the parent MFD device instead, what works for both cases. Signed-off-by: Marek Szyprowski Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231006100320.2908210-1-m.szyprowski@samsung.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/max77693-haptic.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/input/misc/max77693-haptic.c b/drivers/input/misc/max77693-haptic.c index 80f4416ffe2fe..0e646f1b257b8 100644 --- a/drivers/input/misc/max77693-haptic.c +++ b/drivers/input/misc/max77693-haptic.c @@ -307,7 +307,7 @@ static int max77693_haptic_probe(struct platform_device *pdev) haptic->suspend_state = false; /* Variant-specific init */ - haptic->dev_type = platform_get_device_id(pdev)->driver_data; + haptic->dev_type = max77693->type; switch (haptic->dev_type) { case TYPE_MAX77693: haptic->regmap_haptic = max77693->regmap_haptic; @@ -406,16 +406,24 @@ static DEFINE_SIMPLE_DEV_PM_OPS(max77693_haptic_pm_ops, max77693_haptic_resume); static const struct platform_device_id max77693_haptic_id[] = { - { "max77693-haptic", TYPE_MAX77693 }, - { "max77843-haptic", TYPE_MAX77843 }, + { "max77693-haptic", }, + { "max77843-haptic", }, {}, }; MODULE_DEVICE_TABLE(platform, max77693_haptic_id); +static const struct of_device_id of_max77693_haptic_dt_match[] = { + { .compatible = "maxim,max77693-haptic", }, + { .compatible = "maxim,max77843-haptic", }, + { /* sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, of_max77693_haptic_dt_match); + static struct platform_driver max77693_haptic_driver = { .driver = { .name = "max77693-haptic", .pm = pm_sleep_ptr(&max77693_haptic_pm_ops), + .of_match_table = of_max77693_haptic_dt_match, }, .probe = max77693_haptic_probe, .id_table = max77693_haptic_id, -- GitLab From 57b89048874c9edd4a17f34c126b4a4188b7b444 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 5 Dec 2023 15:14:33 -0400 Subject: [PATCH 0322/1290] iommu/arm-smmu-v3: Add a type for the STE Instead of passing a naked __le16 * around to represent a STE wrap it in a "struct arm_smmu_ste" with an array of the correct size. This makes it much clearer which functions will comprise the "STE API". Reviewed-by: Moritz Fischer Reviewed-by: Michael Shavit Reviewed-by: Eric Auger Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 59 ++++++++++----------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 ++- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 7086e5fa41ff6..ad1e055e420d3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1252,7 +1252,7 @@ static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid) } static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, - __le64 *dst) + struct arm_smmu_ste *dst) { /* * This is hideously complicated, but we only really care about @@ -1270,7 +1270,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, * 2. Write everything apart from dword 0, sync, write dword 0, sync * 3. Update Config, sync */ - u64 val = le64_to_cpu(dst[0]); + u64 val = le64_to_cpu(dst->data[0]); bool ste_live = false; struct arm_smmu_device *smmu = NULL; struct arm_smmu_ctx_desc_cfg *cd_table = NULL; @@ -1328,10 +1328,10 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, else val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); - dst[0] = cpu_to_le64(val); - dst[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + dst->data[0] = cpu_to_le64(val); + dst->data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); - dst[2] = 0; /* Nuke the VMID */ + dst->data[2] = 0; /* Nuke the VMID */ /* * The SMMU can perform negative caching, so we must sync * the STE regardless of whether the old value was live. @@ -1346,7 +1346,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1; BUG_ON(ste_live); - dst[1] = cpu_to_le64( + dst->data[1] = cpu_to_le64( FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | @@ -1355,7 +1355,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, if (smmu->features & ARM_SMMU_FEAT_STALLS && !master->stall_enabled) - dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); + dst->data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); val |= (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | @@ -1365,7 +1365,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, if (s2_cfg) { BUG_ON(ste_live); - dst[2] = cpu_to_le64( + dst->data[2] = cpu_to_le64( FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | #ifdef __BIG_ENDIAN @@ -1374,18 +1374,18 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2R); - dst[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); + dst->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS); } if (master->ats_enabled) - dst[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, + dst->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_TRANS)); arm_smmu_sync_ste_for_sid(smmu, sid); /* See comment in arm_smmu_write_ctx_desc() */ - WRITE_ONCE(dst[0], cpu_to_le64(val)); + WRITE_ONCE(dst->data[0], cpu_to_le64(val)); arm_smmu_sync_ste_for_sid(smmu, sid); /* It's likely that we'll want to use the new STE soon */ @@ -1393,7 +1393,8 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd); } -static void arm_smmu_init_bypass_stes(__le64 *strtab, unsigned int nent, bool force) +static void arm_smmu_init_bypass_stes(struct arm_smmu_ste *strtab, + unsigned int nent, bool force) { unsigned int i; u64 val = STRTAB_STE_0_V; @@ -1404,11 +1405,11 @@ static void arm_smmu_init_bypass_stes(__le64 *strtab, unsigned int nent, bool fo val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); for (i = 0; i < nent; ++i) { - strtab[0] = cpu_to_le64(val); - strtab[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, - STRTAB_STE_1_SHCFG_INCOMING)); - strtab[2] = 0; - strtab += STRTAB_STE_DWORDS; + strtab->data[0] = cpu_to_le64(val); + strtab->data[1] = cpu_to_le64(FIELD_PREP( + STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); + strtab->data[2] = 0; + strtab++; } } @@ -2212,26 +2213,23 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) return 0; } -static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) +static struct arm_smmu_ste * +arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) { - __le64 *step; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) { - struct arm_smmu_strtab_l1_desc *l1_desc; - int idx; + unsigned int idx1, idx2; /* Two-level walk */ - idx = (sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS; - l1_desc = &cfg->l1_desc[idx]; - idx = (sid & ((1 << STRTAB_SPLIT) - 1)) * STRTAB_STE_DWORDS; - step = &l1_desc->l2ptr[idx]; + idx1 = (sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS; + idx2 = sid & ((1 << STRTAB_SPLIT) - 1); + return &cfg->l1_desc[idx1].l2ptr[idx2]; } else { /* Simple linear lookup */ - step = &cfg->strtab[sid * STRTAB_STE_DWORDS]; + return (struct arm_smmu_ste *)&cfg + ->strtab[sid * STRTAB_STE_DWORDS]; } - - return step; } static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) @@ -2241,7 +2239,8 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) for (i = 0; i < master->num_streams; ++i) { u32 sid = master->streams[i].id; - __le64 *step = arm_smmu_get_step_for_sid(smmu, sid); + struct arm_smmu_ste *step = + arm_smmu_get_step_for_sid(smmu, sid); /* Bridged PCI devices may end up with duplicated IDs */ for (j = 0; j < i; j++) @@ -3772,7 +3771,7 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) iort_get_rmr_sids(dev_fwnode(smmu->dev), &rmr_list); list_for_each_entry(e, &rmr_list, list) { - __le64 *step; + struct arm_smmu_ste *step; struct iommu_iort_rmr_data *rmr; int ret, i; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 961205ba86d25..03f9e526cbd92 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -206,6 +206,11 @@ #define STRTAB_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 6) #define STRTAB_STE_DWORDS 8 + +struct arm_smmu_ste { + __le64 data[STRTAB_STE_DWORDS]; +}; + #define STRTAB_STE_0_V (1UL << 0) #define STRTAB_STE_0_CFG GENMASK_ULL(3, 1) #define STRTAB_STE_0_CFG_ABORT 0 @@ -571,7 +576,7 @@ struct arm_smmu_priq { struct arm_smmu_strtab_l1_desc { u8 span; - __le64 *l2ptr; + struct arm_smmu_ste *l2ptr; dma_addr_t l2ptr_dma; }; -- GitLab From 12a48fe90d0919e4e594815122403f793a090803 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 5 Dec 2023 15:14:34 -0400 Subject: [PATCH 0323/1290] iommu/arm-smmu-v3: Master cannot be NULL in arm_smmu_write_strtab_ent() The only caller is arm_smmu_install_ste_for_dev() which never has a NULL master. Remove the confusing if. Reviewed-by: Moritz Fischer Reviewed-by: Michael Shavit Reviewed-by: Eric Auger Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ad1e055e420d3..46b2555b7c5af 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1272,10 +1272,10 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, */ u64 val = le64_to_cpu(dst->data[0]); bool ste_live = false; - struct arm_smmu_device *smmu = NULL; + struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_ctx_desc_cfg *cd_table = NULL; struct arm_smmu_s2_cfg *s2_cfg = NULL; - struct arm_smmu_domain *smmu_domain = NULL; + struct arm_smmu_domain *smmu_domain = master->domain; struct arm_smmu_cmdq_ent prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG, .prefetch = { @@ -1283,11 +1283,6 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, }, }; - if (master) { - smmu_domain = master->domain; - smmu = master->smmu; - } - if (smmu_domain) { switch (smmu_domain->stage) { case ARM_SMMU_DOMAIN_S1: -- GitLab From 9fde008337d3be8afa7b3b42c1787aac4b6853c9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 5 Dec 2023 15:14:35 -0400 Subject: [PATCH 0324/1290] iommu/arm-smmu-v3: Remove ARM_SMMU_DOMAIN_NESTED Currently this is exactly the same as ARM_SMMU_DOMAIN_S2, so just remove it. The ongoing work to add nesting support through iommufd will do something a little different. Reviewed-by: Moritz Fischer Reviewed-by: Eric Auger Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 +--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 46b2555b7c5af..5387c90b32930 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1289,7 +1289,6 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, cd_table = &master->cd_table; break; case ARM_SMMU_DOMAIN_S2: - case ARM_SMMU_DOMAIN_NESTED: s2_cfg = &smmu_domain->s2_cfg; break; default: @@ -2170,7 +2169,6 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) fmt = ARM_64_LPAE_S1; finalise_stage_fn = arm_smmu_domain_finalise_s1; break; - case ARM_SMMU_DOMAIN_NESTED: case ARM_SMMU_DOMAIN_S2: ias = smmu->ias; oas = smmu->oas; @@ -2739,7 +2737,7 @@ static int arm_smmu_enable_nesting(struct iommu_domain *domain) if (smmu_domain->smmu) ret = -EPERM; else - smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; + smmu_domain->stage = ARM_SMMU_DOMAIN_S2; mutex_unlock(&smmu_domain->init_mutex); return ret; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 03f9e526cbd92..27ddf1acd12ce 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -715,7 +715,6 @@ struct arm_smmu_master { enum arm_smmu_domain_stage { ARM_SMMU_DOMAIN_S1 = 0, ARM_SMMU_DOMAIN_S2, - ARM_SMMU_DOMAIN_NESTED, ARM_SMMU_DOMAIN_BYPASS, }; -- GitLab From ff0f802974136c7f4c576da227565dc27cc1c69d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 17 Oct 2023 15:11:40 -0300 Subject: [PATCH 0325/1290] iommu/arm-smmu: Reorganize arm_smmu_domain_add_master() Make arm_smmu_domain_add_master() not use the smmu_domain to detect the s2cr configuration, instead pass it in as a parameter. It always returns zero so make it return void. Since it no longer really does anything to do with a domain call it arm_smmu_master_install_s2crs(). This is done to make the next two patches able to re-use this code without forcing the creation of a struct arm_smmu_domain. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-c86cc8c2230e+160bb-smmu_newapi_jgg@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d6d1a2a55cc06..e2ec1fe14ed40 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1081,21 +1081,14 @@ static void arm_smmu_master_free_smes(struct arm_smmu_master_cfg *cfg, mutex_unlock(&smmu->stream_map_mutex); } -static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain, - struct arm_smmu_master_cfg *cfg, - struct iommu_fwspec *fwspec) +static void arm_smmu_master_install_s2crs(struct arm_smmu_master_cfg *cfg, + enum arm_smmu_s2cr_type type, + u8 cbndx, struct iommu_fwspec *fwspec) { - struct arm_smmu_device *smmu = smmu_domain->smmu; + struct arm_smmu_device *smmu = cfg->smmu; struct arm_smmu_s2cr *s2cr = smmu->s2crs; - u8 cbndx = smmu_domain->cfg.cbndx; - enum arm_smmu_s2cr_type type; int i, idx; - if (smmu_domain->stage == ARM_SMMU_DOMAIN_BYPASS) - type = S2CR_TYPE_BYPASS; - else - type = S2CR_TYPE_TRANS; - for_each_cfg_sme(cfg, fwspec, i, idx) { if (type == s2cr[idx].type && cbndx == s2cr[idx].cbndx) continue; @@ -1105,7 +1098,6 @@ static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain, s2cr[idx].cbndx = cbndx; arm_smmu_write_s2cr(smmu, idx); } - return 0; } static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) @@ -1153,7 +1145,12 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) } /* Looks ok, so add the device to the domain */ - ret = arm_smmu_domain_add_master(smmu_domain, cfg, fwspec); + arm_smmu_master_install_s2crs(cfg, + smmu_domain->stage == + ARM_SMMU_DOMAIN_BYPASS ? + S2CR_TYPE_BYPASS : + S2CR_TYPE_TRANS, + smmu_domain->cfg.cbndx, fwspec); /* * Setup an autosuspend delay to avoid bouncing runpm state. -- GitLab From 22bb7b41476a1b8dc282dc5b3ca5470090b52e46 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 17 Oct 2023 15:11:41 -0300 Subject: [PATCH 0326/1290] iommu/arm-smmu: Convert to a global static identity domain Create a global static identity domain with it's own arm_smmu_attach_dev_identity() that simply calls arm_smmu_master_install_s2crs() with the identity parameters. This is done by giving the attach path for identity its own unique implementation that simply calls arm_smmu_master_install_s2crs(). Remove ARM_SMMU_DOMAIN_BYPASS and all checks of IOMMU_DOMAIN_IDENTITY. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v2-c86cc8c2230e+160bb-smmu_newapi_jgg@nvidia.com [will: Move duplicated autosuspend logic into a helper function] Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 81 ++++++++++++++++++--------- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 - 2 files changed, 53 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index e2ec1fe14ed40..dec912c271412 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -82,6 +82,23 @@ static inline void arm_smmu_rpm_put(struct arm_smmu_device *smmu) pm_runtime_put_autosuspend(smmu->dev); } +static void arm_smmu_rpm_use_autosuspend(struct arm_smmu_device *smmu) +{ + /* + * Setup an autosuspend delay to avoid bouncing runpm state. + * Otherwise, if a driver for a suspended consumer device + * unmaps buffers, it will runpm resume/suspend for each one. + * + * For example, when used by a GPU device, when an application + * or game exits, it can trigger unmapping 100s or 1000s of + * buffers. With a runpm cycle for each buffer, that adds up + * to 5-10sec worth of reprogramming the context bank, while + * the system appears to be locked up to the user. + */ + pm_runtime_set_autosuspend_delay(smmu->dev, 20); + pm_runtime_use_autosuspend(smmu->dev); +} + static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) { return container_of(dom, struct arm_smmu_domain, domain); @@ -624,12 +641,6 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, if (smmu_domain->smmu) goto out_unlock; - if (domain->type == IOMMU_DOMAIN_IDENTITY) { - smmu_domain->stage = ARM_SMMU_DOMAIN_BYPASS; - smmu_domain->smmu = smmu; - goto out_unlock; - } - /* * Mapping the requested stage onto what we support is surprisingly * complicated, mainly because the spec allows S1+S2 SMMUs without @@ -825,7 +836,7 @@ static void arm_smmu_destroy_domain_context(struct iommu_domain *domain) struct arm_smmu_cfg *cfg = &smmu_domain->cfg; int ret, irq; - if (!smmu || domain->type == IOMMU_DOMAIN_IDENTITY) + if (!smmu) return; ret = arm_smmu_rpm_get(smmu); @@ -854,7 +865,7 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) { struct arm_smmu_domain *smmu_domain; - if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_IDENTITY) { + if (type != IOMMU_DOMAIN_UNMANAGED) { if (using_legacy_binding || type != IOMMU_DOMAIN_DMA) return NULL; } @@ -1145,32 +1156,45 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) } /* Looks ok, so add the device to the domain */ - arm_smmu_master_install_s2crs(cfg, - smmu_domain->stage == - ARM_SMMU_DOMAIN_BYPASS ? - S2CR_TYPE_BYPASS : - S2CR_TYPE_TRANS, + arm_smmu_master_install_s2crs(cfg, S2CR_TYPE_TRANS, smmu_domain->cfg.cbndx, fwspec); - - /* - * Setup an autosuspend delay to avoid bouncing runpm state. - * Otherwise, if a driver for a suspended consumer device - * unmaps buffers, it will runpm resume/suspend for each one. - * - * For example, when used by a GPU device, when an application - * or game exits, it can trigger unmapping 100s or 1000s of - * buffers. With a runpm cycle for each buffer, that adds up - * to 5-10sec worth of reprogramming the context bank, while - * the system appears to be locked up to the user. - */ - pm_runtime_set_autosuspend_delay(smmu->dev, 20); - pm_runtime_use_autosuspend(smmu->dev); - + arm_smmu_rpm_use_autosuspend(smmu); rpm_put: arm_smmu_rpm_put(smmu); return ret; } +static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, + struct device *dev) +{ + struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev); + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct arm_smmu_device *smmu; + int ret; + + if (!cfg) + return -ENODEV; + smmu = cfg->smmu; + + ret = arm_smmu_rpm_get(smmu); + if (ret < 0) + return ret; + + arm_smmu_master_install_s2crs(cfg, S2CR_TYPE_BYPASS, 0, fwspec); + arm_smmu_rpm_use_autosuspend(smmu); + arm_smmu_rpm_put(smmu); + return 0; +} + +static const struct iommu_domain_ops arm_smmu_identity_ops = { + .attach_dev = arm_smmu_attach_dev_identity, +}; + +static struct iommu_domain arm_smmu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &arm_smmu_identity_ops, +}; + static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -1557,6 +1581,7 @@ static int arm_smmu_def_domain_type(struct device *dev) } static struct iommu_ops arm_smmu_ops = { + .identity_domain = &arm_smmu_identity_domain, .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, .probe_device = arm_smmu_probe_device, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 703fd5817ec11..836ed6799a801 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -361,7 +361,6 @@ enum arm_smmu_domain_stage { ARM_SMMU_DOMAIN_S1 = 0, ARM_SMMU_DOMAIN_S2, ARM_SMMU_DOMAIN_NESTED, - ARM_SMMU_DOMAIN_BYPASS, }; struct arm_smmu_domain { -- GitLab From bbbf11eea38c0cb167edc5ce5fe76324e87ad074 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 17 Oct 2023 15:11:42 -0300 Subject: [PATCH 0327/1290] iommu/arm-smmu: Implement IOMMU_DOMAIN_BLOCKED Using the same design as IDENTITY setup a S2CR_TYPE_FAULT s2cr for the device. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v2-c86cc8c2230e+160bb-smmu_newapi_jgg@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 28 ++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index dec912c271412..c76b68a296bcb 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1164,8 +1164,8 @@ rpm_put: return ret; } -static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) +static int arm_smmu_attach_dev_type(struct device *dev, + enum arm_smmu_s2cr_type type) { struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); @@ -1180,12 +1180,18 @@ static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, if (ret < 0) return ret; - arm_smmu_master_install_s2crs(cfg, S2CR_TYPE_BYPASS, 0, fwspec); + arm_smmu_master_install_s2crs(cfg, type, 0, fwspec); arm_smmu_rpm_use_autosuspend(smmu); arm_smmu_rpm_put(smmu); return 0; } +static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, + struct device *dev) +{ + return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS); +} + static const struct iommu_domain_ops arm_smmu_identity_ops = { .attach_dev = arm_smmu_attach_dev_identity, }; @@ -1195,6 +1201,21 @@ static struct iommu_domain arm_smmu_identity_domain = { .ops = &arm_smmu_identity_ops, }; +static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, + struct device *dev) +{ + return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT); +} + +static const struct iommu_domain_ops arm_smmu_blocked_ops = { + .attach_dev = arm_smmu_attach_dev_blocked, +}; + +static struct iommu_domain arm_smmu_blocked_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &arm_smmu_blocked_ops, +}; + static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -1582,6 +1603,7 @@ static int arm_smmu_def_domain_type(struct device *dev) static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, + .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, .probe_device = arm_smmu_probe_device, -- GitLab From e0976331ad114af8e379e18483c346c6c79ca858 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 17 Oct 2023 15:11:43 -0300 Subject: [PATCH 0328/1290] iommu/arm-smmu: Pass arm_smmu_domain to internal functions Keep the types consistent, all the callers of these functions already have obtained a struct arm_smmu_domain, don't needlessly go to/from an iommu_domain through the internal call chains. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v2-c86cc8c2230e+160bb-smmu_newapi_jgg@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index c76b68a296bcb..664d53dfb3bdb 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -409,8 +409,7 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev) { u32 fsr, fsynr, cbfrsynra; unsigned long iova; - struct iommu_domain *domain = dev; - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_domain *smmu_domain = dev; struct arm_smmu_device *smmu = smmu_domain->smmu; int idx = smmu_domain->cfg.cbndx; int ret; @@ -423,7 +422,7 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev) iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR); cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); - ret = report_iommu_fault(domain, NULL, iova, + ret = report_iommu_fault(&smmu_domain->domain, NULL, iova, fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); if (ret == -ENOSYS) @@ -624,7 +623,7 @@ static int arm_smmu_alloc_context_bank(struct arm_smmu_domain *smmu_domain, return __arm_smmu_alloc_bitmap(smmu->context_map, start, smmu->num_context_banks); } -static int arm_smmu_init_domain_context(struct iommu_domain *domain, +static int arm_smmu_init_domain_context(struct arm_smmu_domain *smmu_domain, struct arm_smmu_device *smmu, struct device *dev) { @@ -633,7 +632,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, struct io_pgtable_ops *pgtbl_ops; struct io_pgtable_cfg pgtbl_cfg; enum io_pgtable_fmt fmt; - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct iommu_domain *domain = &smmu_domain->domain; struct arm_smmu_cfg *cfg = &smmu_domain->cfg; irqreturn_t (*context_fault)(int irq, void *dev); @@ -807,8 +806,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, else context_fault = arm_smmu_context_fault; - ret = devm_request_irq(smmu->dev, irq, context_fault, - IRQF_SHARED, "arm-smmu-context-fault", domain); + ret = devm_request_irq(smmu->dev, irq, context_fault, IRQF_SHARED, + "arm-smmu-context-fault", smmu_domain); if (ret < 0) { dev_err(smmu->dev, "failed to request context IRQ %d (%u)\n", cfg->irptndx, irq); @@ -829,9 +828,8 @@ out_unlock: return ret; } -static void arm_smmu_destroy_domain_context(struct iommu_domain *domain) +static void arm_smmu_destroy_domain_context(struct arm_smmu_domain *smmu_domain) { - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_cfg *cfg = &smmu_domain->cfg; int ret, irq; @@ -852,7 +850,7 @@ static void arm_smmu_destroy_domain_context(struct iommu_domain *domain) if (cfg->irptndx != ARM_SMMU_INVALID_IRPTNDX) { irq = smmu->irqs[cfg->irptndx]; - devm_free_irq(smmu->dev, irq, domain); + devm_free_irq(smmu->dev, irq, smmu_domain); } free_io_pgtable_ops(smmu_domain->pgtbl_ops); @@ -892,7 +890,7 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) * Free the domain resources. We assume that all devices have * already been detached. */ - arm_smmu_destroy_domain_context(domain); + arm_smmu_destroy_domain_context(smmu_domain); kfree(smmu_domain); } @@ -1142,7 +1140,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return ret; /* Ensure that the domain is finalised */ - ret = arm_smmu_init_domain_context(domain, smmu, dev); + ret = arm_smmu_init_domain_context(smmu_domain, smmu, dev); if (ret < 0) goto rpm_put; -- GitLab From 9b3febc3a3da7fcd81ece10614b7fd6c729ba8b4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 17 Oct 2023 15:11:44 -0300 Subject: [PATCH 0329/1290] iommu/arm-smmu: Convert to domain_alloc_paging() Now that the BLOCKED and IDENTITY behaviors are managed with their own domains change to the domain_alloc_paging() op. The check for using_legacy_binding is now redundant, arm_smmu_def_domain_type() always returns IOMMU_DOMAIN_IDENTITY for this mode, so the core code will never attempt to create a DMA domain in the first place. Since commit a4fdd9762272 ("iommu: Use flush queue capability") the core code only passes in IDENTITY/BLOCKED/UNMANAGED/DMA domain types. It will not pass in IDENTITY or BLOCKED if the global statics exist, so the test for DMA is also redundant now too. Call arm_smmu_init_domain_context() early if a dev is available. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v2-c86cc8c2230e+160bb-smmu_newapi_jgg@nvidia.com [will: Simplify arm_smmu_domain_alloc_paging() since 'cfg' cannot be NULL] Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 664d53dfb3bdb..b0a6b367d8a2c 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -859,14 +859,10 @@ static void arm_smmu_destroy_domain_context(struct arm_smmu_domain *smmu_domain) arm_smmu_rpm_put(smmu); } -static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) +static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) { struct arm_smmu_domain *smmu_domain; - if (type != IOMMU_DOMAIN_UNMANAGED) { - if (using_legacy_binding || type != IOMMU_DOMAIN_DMA) - return NULL; - } /* * Allocate the domain and initialise some of its data structures. * We can't really do anything meaningful until we've added a @@ -879,6 +875,15 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) mutex_init(&smmu_domain->init_mutex); spin_lock_init(&smmu_domain->cb_lock); + if (dev) { + struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev); + + if (arm_smmu_init_domain_context(smmu_domain, cfg->smmu, dev)) { + kfree(smmu_domain); + return NULL; + } + } + return &smmu_domain->domain; } @@ -1603,7 +1608,7 @@ static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, - .domain_alloc = arm_smmu_domain_alloc, + .domain_alloc_paging = arm_smmu_domain_alloc_paging, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .probe_finalize = arm_smmu_probe_finalize, -- GitLab From 7d84a63a39b78443d09f2b4edf7ecb1d586379b4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 Dec 2023 18:14:32 +0200 Subject: [PATCH 0330/1290] backlight: hx8357: Convert to agnostic GPIO API The of_gpio.h is going to be removed. In preparation of that convert the driver to the agnostic API. Fixes: fbbbcd177a27 ("gpiolib: of: add quirk for locating reset lines with legacy bindings") Signed-off-by: Andy Shevchenko Reviewed-by: Daniel Thompson Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20231207161513.3195509-2-andriy.shevchenko@linux.intel.com Signed-off-by: Lee Jones --- drivers/gpio/gpiolib-of.c | 4 +- drivers/video/backlight/hx8357.c | 74 ++++++++++---------------------- 2 files changed, 24 insertions(+), 54 deletions(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 402f7d99b0c1e..e7770eedd1469 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -184,7 +184,7 @@ static void of_gpio_try_fixup_polarity(const struct device_node *np, const char *propname; bool active_high; } gpios[] = { -#if !IS_ENABLED(CONFIG_LCD_HX8357) +#if IS_ENABLED(CONFIG_LCD_HX8357) /* * Himax LCD controllers used incorrectly named * "gpios-reset" property and also specified wrong @@ -478,7 +478,7 @@ static struct gpio_desc *of_find_gpio_rename(struct device_node *np, */ const char *compatible; } gpios[] = { -#if !IS_ENABLED(CONFIG_LCD_HX8357) +#if IS_ENABLED(CONFIG_LCD_HX8357) /* Himax LCD controllers used "gpios-reset" */ { "reset", "gpios-reset", "himax,hx8357" }, { "reset", "gpios-reset", "himax,hx8369" }, diff --git a/drivers/video/backlight/hx8357.c b/drivers/video/backlight/hx8357.c index f76d2469d4901..d7298376cf74d 100644 --- a/drivers/video/backlight/hx8357.c +++ b/drivers/video/backlight/hx8357.c @@ -6,11 +6,11 @@ */ #include +#include #include #include #include #include -#include #include #define HX8357_NUM_IM_PINS 3 @@ -83,11 +83,10 @@ #define HX8369_SET_GAMMA_CURVE_RELATED 0xe0 struct hx8357_data { - unsigned im_pins[HX8357_NUM_IM_PINS]; - unsigned reset; + struct gpio_descs *im_pins; + struct gpio_desc *reset; struct spi_device *spi; int state; - bool use_im_pins; }; static u8 hx8357_seq_power[] = { @@ -321,11 +320,11 @@ static void hx8357_lcd_reset(struct lcd_device *lcdev) struct hx8357_data *lcd = lcd_get_data(lcdev); /* Reset the screen */ - gpio_set_value(lcd->reset, 1); + gpiod_set_value(lcd->reset, 0); usleep_range(10000, 12000); - gpio_set_value(lcd->reset, 0); + gpiod_set_value(lcd->reset, 1); usleep_range(10000, 12000); - gpio_set_value(lcd->reset, 1); + gpiod_set_value(lcd->reset, 0); /* The controller needs 120ms to recover from reset */ msleep(120); @@ -340,10 +339,10 @@ static int hx8357_lcd_init(struct lcd_device *lcdev) * Set the interface selection pins to SPI mode, with three * wires */ - if (lcd->use_im_pins) { - gpio_set_value_cansleep(lcd->im_pins[0], 1); - gpio_set_value_cansleep(lcd->im_pins[1], 0); - gpio_set_value_cansleep(lcd->im_pins[2], 1); + if (lcd->im_pins) { + gpiod_set_value_cansleep(lcd->im_pins->desc[0], 1); + gpiod_set_value_cansleep(lcd->im_pins->desc[1], 0); + gpiod_set_value_cansleep(lcd->im_pins->desc[2], 1); } ret = hx8357_spi_write_array(lcdev, hx8357_seq_power, @@ -580,6 +579,7 @@ MODULE_DEVICE_TABLE(of, hx8357_dt_ids); static int hx8357_probe(struct spi_device *spi) { + struct device *dev = &spi->dev; struct lcd_device *lcdev; struct hx8357_data *lcd; const struct of_device_id *match; @@ -601,49 +601,19 @@ static int hx8357_probe(struct spi_device *spi) if (!match || !match->data) return -EINVAL; - lcd->reset = of_get_named_gpio(spi->dev.of_node, "gpios-reset", 0); - if (!gpio_is_valid(lcd->reset)) { - dev_err(&spi->dev, "Missing dt property: gpios-reset\n"); - return -EINVAL; - } + lcd->reset = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW); + if (IS_ERR(lcd->reset)) + return dev_err_probe(dev, PTR_ERR(lcd->reset), "failed to request reset GPIO\n"); + gpiod_set_consumer_name(lcd->reset, "hx8357-reset"); - ret = devm_gpio_request_one(&spi->dev, lcd->reset, - GPIOF_OUT_INIT_HIGH, - "hx8357-reset"); - if (ret) { - dev_err(&spi->dev, - "failed to request gpio %d: %d\n", - lcd->reset, ret); - return -EINVAL; - } + lcd->im_pins = devm_gpiod_get_array_optional(dev, "im", GPIOD_OUT_LOW); + if (IS_ERR(lcd->im_pins)) + return dev_err_probe(dev, PTR_ERR(lcd->im_pins), "failed to request im GPIOs\n"); + if (lcd->im_pins->ndescs < HX8357_NUM_IM_PINS) + return dev_err_probe(dev, -EINVAL, "not enough im GPIOs\n"); - if (of_property_present(spi->dev.of_node, "im-gpios")) { - lcd->use_im_pins = 1; - - for (i = 0; i < HX8357_NUM_IM_PINS; i++) { - lcd->im_pins[i] = of_get_named_gpio(spi->dev.of_node, - "im-gpios", i); - if (lcd->im_pins[i] == -EPROBE_DEFER) { - dev_info(&spi->dev, "GPIO requested is not here yet, deferring the probe\n"); - return -EPROBE_DEFER; - } - if (!gpio_is_valid(lcd->im_pins[i])) { - dev_err(&spi->dev, "Missing dt property: im-gpios\n"); - return -EINVAL; - } - - ret = devm_gpio_request_one(&spi->dev, lcd->im_pins[i], - GPIOF_OUT_INIT_LOW, - "im_pins"); - if (ret) { - dev_err(&spi->dev, "failed to request gpio %d: %d\n", - lcd->im_pins[i], ret); - return -EINVAL; - } - } - } else { - lcd->use_im_pins = 0; - } + for (i = 0; i < HX8357_NUM_IM_PINS; i++) + gpiod_set_consumer_name(lcd->im_pins->desc[i], "im_pins"); lcdev = devm_lcd_device_register(&spi->dev, "mxsfb", &spi->dev, lcd, &hx8357_ops); -- GitLab From 813900d19b923fc1b241c1ce292472f68066092b Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 7 Dec 2023 16:16:34 +0800 Subject: [PATCH 0331/1290] perf header: Fix one memory leakage in perf_event__fprintf_event_update() When dump the raw trace by `perf report -D` ASan reports a memory leakage in perf_event__fprintf_event_update(). It shows that we allocated a temporary cpumap for dumping the CPUs but doesn't release it and it's not used elsewhere. Fix this by free the cpumap after the dumping. Fixes: c853f9394b7bc189 ("perf tools: Add perf_event__fprintf_event_update function") Reviewed-by: Ian Rogers Signed-off-by: Yicong Yang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Cameron Cc: Junhao He Cc: Mark Rutland Cc: Peter Zijlstra Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/20231207081635.8427-2-yangyicong@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 08cc2febabde2..a9f71f8343f08 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -4391,9 +4391,10 @@ size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp) ret += fprintf(fp, "... "); map = cpu_map__new_data(&ev->cpus.cpus); - if (map) + if (map) { ret += cpu_map__fprintf(map, fp); - else + perf_cpu_map__put(map); + } else ret += fprintf(fp, "failed to get cpus\n"); break; default: -- GitLab From 1bc479d665bc25a9a4e8168d5b400a47491511f9 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 7 Dec 2023 16:16:35 +0800 Subject: [PATCH 0332/1290] perf hisi-ptt: Fix one memory leakage in hisi_ptt_process_auxtrace_event() ASan complains a memory leakage in hisi_ptt_process_auxtrace_event() that the data buffer is not freed. Since currently we only support the raw dump trace mode, the data buffer is used only within this function. So fix this by freeing the data buffer before going out. Fixes: 5e91e57e68090c0e ("perf auxtrace arm64: Add support for parsing HiSilicon PCIe Trace packet") Reviewed-by: Ian Rogers Signed-off-by: Yicong Yang Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Cameron Cc: Junhao He Cc: Mark Rutland Cc: Peter Zijlstra Cc: Qi Liu Link: https://lore.kernel.org/r/20231207081635.8427-3-yangyicong@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hisi-ptt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/hisi-ptt.c b/tools/perf/util/hisi-ptt.c index 43bd1ca62d582..52d0ce302ca04 100644 --- a/tools/perf/util/hisi-ptt.c +++ b/tools/perf/util/hisi-ptt.c @@ -123,6 +123,7 @@ static int hisi_ptt_process_auxtrace_event(struct perf_session *session, if (dump_trace) hisi_ptt_dump_event(ptt, data, size); + free(data); return 0; } -- GitLab From d4db8762dc4c364dce89901ed4690588ad310bfc Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Tue, 12 Dec 2023 22:37:16 -0800 Subject: [PATCH 0333/1290] Input: use sysfs_emit() instead of scnprintf() Replace calls to scnprintf() in the methods showing device attributes with sysfs_emit() to simplify the code. Signed-off-by: ye xingchen Link: https://lore.kernel.org/r/202212021133398847947@zte.com.cn Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/input/input.c b/drivers/input/input.c index 8c5fdb0f858ab..f71ea4fb173fd 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -1365,8 +1365,8 @@ static ssize_t input_dev_show_##name(struct device *dev, \ { \ struct input_dev *input_dev = to_input_dev(dev); \ \ - return scnprintf(buf, PAGE_SIZE, "%s\n", \ - input_dev->name ? input_dev->name : ""); \ + return sysfs_emit(buf, "%s\n", \ + input_dev->name ? input_dev->name : ""); \ } \ static DEVICE_ATTR(name, S_IRUGO, input_dev_show_##name, NULL) @@ -1458,7 +1458,7 @@ static ssize_t inhibited_show(struct device *dev, { struct input_dev *input_dev = to_input_dev(dev); - return scnprintf(buf, PAGE_SIZE, "%d\n", input_dev->inhibited); + return sysfs_emit(buf, "%d\n", input_dev->inhibited); } static ssize_t inhibited_store(struct device *dev, @@ -1505,7 +1505,7 @@ static ssize_t input_dev_show_id_##name(struct device *dev, \ char *buf) \ { \ struct input_dev *input_dev = to_input_dev(dev); \ - return scnprintf(buf, PAGE_SIZE, "%04x\n", input_dev->id.name); \ + return sysfs_emit(buf, "%04x\n", input_dev->id.name); \ } \ static DEVICE_ATTR(name, S_IRUGO, input_dev_show_id_##name, NULL) -- GitLab From 1864a2006ee1e41960b63ca63aa79fabb690e71b Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Tue, 12 Dec 2023 22:12:01 -0800 Subject: [PATCH 0334/1290] Input: mouse - use sysfs_emit[_at]() instead of scnprintf() Replace the calls to various *printf() functions with sysfs_emit() to simplify the code. Signed-off-by: ye xingchen Link: https://lore.kernel.org/r/202212021453578171100@zte.com.cn Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/cyapa.c | 22 +++++++++++----------- drivers/input/mouse/cyapa_gen3.c | 2 +- drivers/input/mouse/cyapa_gen5.c | 4 ++-- drivers/input/mouse/cyapa_gen6.c | 20 ++++++++++---------- drivers/input/mouse/elan_i2c_core.c | 18 +++++++++--------- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/drivers/input/mouse/cyapa.c b/drivers/input/mouse/cyapa.c index a84098448f5b9..5979deabe23d1 100644 --- a/drivers/input/mouse/cyapa.c +++ b/drivers/input/mouse/cyapa.c @@ -756,16 +756,16 @@ static ssize_t cyapa_show_suspend_scanrate(struct device *dev, switch (pwr_cmd) { case PWR_MODE_BTN_ONLY: - len = scnprintf(buf, PAGE_SIZE, "%s\n", BTN_ONLY_MODE_NAME); + len = sysfs_emit(buf, "%s\n", BTN_ONLY_MODE_NAME); break; case PWR_MODE_OFF: - len = scnprintf(buf, PAGE_SIZE, "%s\n", OFF_MODE_NAME); + len = sysfs_emit(buf, "%s\n", OFF_MODE_NAME); break; default: - len = scnprintf(buf, PAGE_SIZE, "%u\n", - cyapa->gen == CYAPA_GEN3 ? + len = sysfs_emit(buf, "%u\n", + cyapa->gen == CYAPA_GEN3 ? cyapa_pwr_cmd_to_sleep_time(pwr_cmd) : sleep_time); break; @@ -877,8 +877,8 @@ static ssize_t cyapa_show_rt_suspend_scanrate(struct device *dev, mutex_unlock(&cyapa->state_sync_lock); - return scnprintf(buf, PAGE_SIZE, "%u\n", - cyapa->gen == CYAPA_GEN3 ? + return sysfs_emit(buf, "%u\n", + cyapa->gen == CYAPA_GEN3 ? cyapa_pwr_cmd_to_sleep_time(pwr_cmd) : sleep_time); } @@ -988,8 +988,8 @@ static ssize_t cyapa_show_fm_ver(struct device *dev, error = mutex_lock_interruptible(&cyapa->state_sync_lock); if (error) return error; - error = scnprintf(buf, PAGE_SIZE, "%d.%d\n", cyapa->fw_maj_ver, - cyapa->fw_min_ver); + error = sysfs_emit(buf, "%d.%d\n", + cyapa->fw_maj_ver, cyapa->fw_min_ver); mutex_unlock(&cyapa->state_sync_lock); return error; } @@ -1004,7 +1004,7 @@ static ssize_t cyapa_show_product_id(struct device *dev, error = mutex_lock_interruptible(&cyapa->state_sync_lock); if (error) return error; - size = scnprintf(buf, PAGE_SIZE, "%s\n", cyapa->product_id); + size = sysfs_emit(buf, "%s\n", cyapa->product_id); mutex_unlock(&cyapa->state_sync_lock); return size; } @@ -1209,8 +1209,8 @@ static ssize_t cyapa_show_mode(struct device *dev, if (error) return error; - size = scnprintf(buf, PAGE_SIZE, "gen%d %s\n", - cyapa->gen, cyapa_state_to_string(cyapa)); + size = sysfs_emit(buf, "gen%d %s\n", + cyapa->gen, cyapa_state_to_string(cyapa)); mutex_unlock(&cyapa->state_sync_lock); return size; diff --git a/drivers/input/mouse/cyapa_gen3.c b/drivers/input/mouse/cyapa_gen3.c index a97f4acb64526..60c83bc71d84e 100644 --- a/drivers/input/mouse/cyapa_gen3.c +++ b/drivers/input/mouse/cyapa_gen3.c @@ -860,7 +860,7 @@ static ssize_t cyapa_gen3_show_baseline(struct device *dev, dev_dbg(dev, "Baseline report successful. Max: %d Min: %d\n", max_baseline, min_baseline); - ret = scnprintf(buf, PAGE_SIZE, "%d %d\n", max_baseline, min_baseline); + ret = sysfs_emit(buf, "%d %d\n", max_baseline, min_baseline); out: return ret; diff --git a/drivers/input/mouse/cyapa_gen5.c b/drivers/input/mouse/cyapa_gen5.c index abf42f77b4c59..2e6bcb07257ed 100644 --- a/drivers/input/mouse/cyapa_gen5.c +++ b/drivers/input/mouse/cyapa_gen5.c @@ -2418,12 +2418,12 @@ resume_scanning: return resume_error ? resume_error : error; /* 12. Output data strings */ - size = scnprintf(buf, PAGE_SIZE, "%d %d %d %d %d %d %d %d %d %d %d ", + size = sysfs_emit(buf, "%d %d %d %d %d %d %d %d %d %d %d ", gidac_mutual_min, gidac_mutual_max, gidac_mutual_ave, lidac_mutual_min, lidac_mutual_max, lidac_mutual_ave, gidac_self_rx, gidac_self_tx, lidac_self_min, lidac_self_max, lidac_self_ave); - size += scnprintf(buf + size, PAGE_SIZE - size, + size += sysfs_emit_at(buf, size, "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", raw_cap_mutual_min, raw_cap_mutual_max, raw_cap_mutual_ave, raw_cap_self_min, raw_cap_self_max, raw_cap_self_ave, diff --git a/drivers/input/mouse/cyapa_gen6.c b/drivers/input/mouse/cyapa_gen6.c index 0caaf3e64215d..4ffe08fee10c8 100644 --- a/drivers/input/mouse/cyapa_gen6.c +++ b/drivers/input/mouse/cyapa_gen6.c @@ -629,14 +629,14 @@ static ssize_t cyapa_gen6_show_baseline(struct device *dev, if (error) goto resume_scanning; - size = scnprintf(buf, PAGE_SIZE, "%d %d %d %d %d %d ", - data[0], /* RX Attenuator Mutual */ - data[1], /* IDAC Mutual */ - data[2], /* RX Attenuator Self RX */ - data[3], /* IDAC Self RX */ - data[4], /* RX Attenuator Self TX */ - data[5] /* IDAC Self TX */ - ); + size = sysfs_emit(buf, "%d %d %d %d %d %d ", + data[0], /* RX Attenuator Mutual */ + data[1], /* IDAC Mutual */ + data[2], /* RX Attenuator Self RX */ + data[3], /* IDAC Self RX */ + data[4], /* RX Attenuator Self TX */ + data[5] /* IDAC Self TX */ + ); /* 3. Read Attenuator Trim. */ data_len = sizeof(data); @@ -648,8 +648,8 @@ static ssize_t cyapa_gen6_show_baseline(struct device *dev, /* set attenuator trim values. */ for (i = 0; i < data_len; i++) - size += scnprintf(buf + size, PAGE_SIZE - size, "%d ", data[i]); - size += scnprintf(buf + size, PAGE_SIZE - size, "\n"); + size += sysfs_emit_at(buf, size, "%d ", data[i]); + size += sysfs_emit_at(buf, size, "\n"); resume_scanning: /* 4. Resume Scanning*/ diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index 148a601396f92..8a72c200ccb5d 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -572,7 +572,7 @@ static ssize_t elan_sysfs_read_fw_checksum(struct device *dev, struct i2c_client *client = to_i2c_client(dev); struct elan_tp_data *data = i2c_get_clientdata(client); - return sprintf(buf, "0x%04x\n", data->fw_checksum); + return sysfs_emit(buf, "0x%04x\n", data->fw_checksum); } static ssize_t elan_sysfs_read_product_id(struct device *dev, @@ -582,8 +582,8 @@ static ssize_t elan_sysfs_read_product_id(struct device *dev, struct i2c_client *client = to_i2c_client(dev); struct elan_tp_data *data = i2c_get_clientdata(client); - return sprintf(buf, ETP_PRODUCT_ID_FORMAT_STRING "\n", - data->product_id); + return sysfs_emit(buf, ETP_PRODUCT_ID_FORMAT_STRING "\n", + data->product_id); } static ssize_t elan_sysfs_read_fw_ver(struct device *dev, @@ -593,7 +593,7 @@ static ssize_t elan_sysfs_read_fw_ver(struct device *dev, struct i2c_client *client = to_i2c_client(dev); struct elan_tp_data *data = i2c_get_clientdata(client); - return sprintf(buf, "%d.0\n", data->fw_version); + return sysfs_emit(buf, "%d.0\n", data->fw_version); } static ssize_t elan_sysfs_read_sm_ver(struct device *dev, @@ -603,7 +603,7 @@ static ssize_t elan_sysfs_read_sm_ver(struct device *dev, struct i2c_client *client = to_i2c_client(dev); struct elan_tp_data *data = i2c_get_clientdata(client); - return sprintf(buf, "%d.0\n", data->sm_version); + return sysfs_emit(buf, "%d.0\n", data->sm_version); } static ssize_t elan_sysfs_read_iap_ver(struct device *dev, @@ -613,7 +613,7 @@ static ssize_t elan_sysfs_read_iap_ver(struct device *dev, struct i2c_client *client = to_i2c_client(dev); struct elan_tp_data *data = i2c_get_clientdata(client); - return sprintf(buf, "%d.0\n", data->iap_version); + return sysfs_emit(buf, "%d.0\n", data->iap_version); } static ssize_t elan_sysfs_update_fw(struct device *dev, @@ -754,7 +754,7 @@ static ssize_t elan_sysfs_read_mode(struct device *dev, if (error) return error; - return sprintf(buf, "%d\n", (int)mode); + return sysfs_emit(buf, "%d\n", (int)mode); } static DEVICE_ATTR(product_id, S_IRUGO, elan_sysfs_read_product_id, NULL); @@ -858,7 +858,7 @@ static ssize_t min_show(struct device *dev, goto out; } - retval = snprintf(buf, PAGE_SIZE, "%d", data->min_baseline); + retval = sysfs_emit(buf, "%d", data->min_baseline); out: mutex_unlock(&data->sysfs_mutex); @@ -881,7 +881,7 @@ static ssize_t max_show(struct device *dev, goto out; } - retval = snprintf(buf, PAGE_SIZE, "%d", data->max_baseline); + retval = sysfs_emit(buf, "%d", data->max_baseline); out: mutex_unlock(&data->sysfs_mutex); -- GitLab From e50389f208daf1d60810e789b8860063e3256693 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Tue, 12 Dec 2023 21:53:40 -0800 Subject: [PATCH 0335/1290] =?UTF-8?q?Input:=20touchscreen=20-=20use=20sysf?= =?UTF-8?q?s=5Femit[=5Fat]()=20instead=20of=C2=A0scnprintf()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: ye xingchen Acked-by: Uwe Kleine-König Acked-by: Oliver Graute Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/atmel_mxt_ts.c | 21 +++++++++------------ drivers/input/touchscreen/edt-ft5x06.c | 2 +- drivers/input/touchscreen/hideep.c | 6 ++---- drivers/input/touchscreen/hycon-hy46xx.c | 2 +- drivers/input/touchscreen/ilitek_ts_i2c.c | 16 ++++++++-------- drivers/input/touchscreen/iqs5xx.c | 12 ++++++------ drivers/input/touchscreen/iqs7211.c | 12 ++++++------ drivers/input/touchscreen/melfas_mip4.c | 16 ++++++++-------- drivers/input/touchscreen/usbtouchscreen.c | 4 ++-- drivers/input/touchscreen/wdt87xx_i2c.c | 6 +++--- 10 files changed, 46 insertions(+), 51 deletions(-) diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index 20094b9899f09..05dcacf7061b2 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -2818,8 +2818,8 @@ static ssize_t mxt_fw_version_show(struct device *dev, { struct mxt_data *data = dev_get_drvdata(dev); struct mxt_info *info = data->info; - return scnprintf(buf, PAGE_SIZE, "%u.%u.%02X\n", - info->version >> 4, info->version & 0xf, info->build); + return sysfs_emit(buf, "%u.%u.%02X\n", + info->version >> 4, info->version & 0xf, info->build); } /* Hardware Version is returned as FamilyID.VariantID */ @@ -2828,8 +2828,7 @@ static ssize_t mxt_hw_version_show(struct device *dev, { struct mxt_data *data = dev_get_drvdata(dev); struct mxt_info *info = data->info; - return scnprintf(buf, PAGE_SIZE, "%u.%u\n", - info->family_id, info->variant_id); + return sysfs_emit(buf, "%u.%u\n", info->family_id, info->variant_id); } static ssize_t mxt_show_instance(char *buf, int count, @@ -2839,19 +2838,18 @@ static ssize_t mxt_show_instance(char *buf, int count, int i; if (mxt_obj_instances(object) > 1) - count += scnprintf(buf + count, PAGE_SIZE - count, - "Instance %u\n", instance); + count += sysfs_emit_at(buf, count, "Instance %u\n", instance); for (i = 0; i < mxt_obj_size(object); i++) - count += scnprintf(buf + count, PAGE_SIZE - count, - "\t[%2u]: %02x (%d)\n", i, val[i], val[i]); - count += scnprintf(buf + count, PAGE_SIZE - count, "\n"); + count += sysfs_emit_at(buf, count, "\t[%2u]: %02x (%d)\n", + i, val[i], val[i]); + count += sysfs_emit_at(buf, count, "\n"); return count; } static ssize_t mxt_object_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { struct mxt_data *data = dev_get_drvdata(dev); struct mxt_object *object; @@ -2872,8 +2870,7 @@ static ssize_t mxt_object_show(struct device *dev, if (!mxt_object_readable(object->type)) continue; - count += scnprintf(buf + count, PAGE_SIZE - count, - "T%u:\n", object->type); + count += sysfs_emit_at(buf, count, "T%u:\n", object->type); for (j = 0; j < mxt_obj_instances(object); j++) { u16 size = mxt_obj_size(object); diff --git a/drivers/input/touchscreen/edt-ft5x06.c b/drivers/input/touchscreen/edt-ft5x06.c index 3e102bcc4a1c7..2a1db11344766 100644 --- a/drivers/input/touchscreen/edt-ft5x06.c +++ b/drivers/input/touchscreen/edt-ft5x06.c @@ -431,7 +431,7 @@ static ssize_t edt_ft5x06_setting_show(struct device *dev, *field = val; } - count = scnprintf(buf, PAGE_SIZE, "%d\n", val); + count = sysfs_emit(buf, "%d\n", val); out: mutex_unlock(&tsdata->mutex); return error ?: count; diff --git a/drivers/input/touchscreen/hideep.c b/drivers/input/touchscreen/hideep.c index 0f58258306bfc..eae90676f4e55 100644 --- a/drivers/input/touchscreen/hideep.c +++ b/drivers/input/touchscreen/hideep.c @@ -928,8 +928,7 @@ static ssize_t hideep_fw_version_show(struct device *dev, ssize_t len; mutex_lock(&ts->dev_mutex); - len = scnprintf(buf, PAGE_SIZE, "%04x\n", - be16_to_cpu(ts->dwz_info.release_ver)); + len = sysfs_emit(buf, "%04x\n", be16_to_cpu(ts->dwz_info.release_ver)); mutex_unlock(&ts->dev_mutex); return len; @@ -943,8 +942,7 @@ static ssize_t hideep_product_id_show(struct device *dev, ssize_t len; mutex_lock(&ts->dev_mutex); - len = scnprintf(buf, PAGE_SIZE, "%04x\n", - be16_to_cpu(ts->dwz_info.product_id)); + len = sysfs_emit(buf, "%04x\n", be16_to_cpu(ts->dwz_info.product_id)); mutex_unlock(&ts->dev_mutex); return len; diff --git a/drivers/input/touchscreen/hycon-hy46xx.c b/drivers/input/touchscreen/hycon-hy46xx.c index d0f257989fd6b..2e01d87977c16 100644 --- a/drivers/input/touchscreen/hycon-hy46xx.c +++ b/drivers/input/touchscreen/hycon-hy46xx.c @@ -202,7 +202,7 @@ static ssize_t hycon_hy46xx_setting_show(struct device *dev, *field = val; } - count = scnprintf(buf, PAGE_SIZE, "%d\n", val); + count = sysfs_emit(buf, "%d\n", val); out: mutex_unlock(&tsdata->mutex); diff --git a/drivers/input/touchscreen/ilitek_ts_i2c.c b/drivers/input/touchscreen/ilitek_ts_i2c.c index 90c4934e750a3..fc4e39b6651a4 100644 --- a/drivers/input/touchscreen/ilitek_ts_i2c.c +++ b/drivers/input/touchscreen/ilitek_ts_i2c.c @@ -512,12 +512,12 @@ static ssize_t firmware_version_show(struct device *dev, struct i2c_client *client = to_i2c_client(dev); struct ilitek_ts_data *ts = i2c_get_clientdata(client); - return scnprintf(buf, PAGE_SIZE, - "fw version: [%02X%02X.%02X%02X.%02X%02X.%02X%02X]\n", - ts->firmware_ver[0], ts->firmware_ver[1], - ts->firmware_ver[2], ts->firmware_ver[3], - ts->firmware_ver[4], ts->firmware_ver[5], - ts->firmware_ver[6], ts->firmware_ver[7]); + return sysfs_emit(buf, + "fw version: [%02X%02X.%02X%02X.%02X%02X.%02X%02X]\n", + ts->firmware_ver[0], ts->firmware_ver[1], + ts->firmware_ver[2], ts->firmware_ver[3], + ts->firmware_ver[4], ts->firmware_ver[5], + ts->firmware_ver[6], ts->firmware_ver[7]); } static DEVICE_ATTR_RO(firmware_version); @@ -527,8 +527,8 @@ static ssize_t product_id_show(struct device *dev, struct i2c_client *client = to_i2c_client(dev); struct ilitek_ts_data *ts = i2c_get_clientdata(client); - return scnprintf(buf, PAGE_SIZE, "product id: [%04X], module: [%s]\n", - ts->mcu_ver, ts->product_id); + return sysfs_emit(buf, "product id: [%04X], module: [%s]\n", + ts->mcu_ver, ts->product_id); } static DEVICE_ATTR_RO(product_id); diff --git a/drivers/input/touchscreen/iqs5xx.c b/drivers/input/touchscreen/iqs5xx.c index a3f4fb85bee58..4d226118f3cc2 100644 --- a/drivers/input/touchscreen/iqs5xx.c +++ b/drivers/input/touchscreen/iqs5xx.c @@ -943,12 +943,12 @@ static ssize_t fw_info_show(struct device *dev, if (!iqs5xx->dev_id_info.bl_status) return -ENODATA; - return scnprintf(buf, PAGE_SIZE, "%u.%u.%u.%u:%u.%u\n", - be16_to_cpu(iqs5xx->dev_id_info.prod_num), - be16_to_cpu(iqs5xx->dev_id_info.proj_num), - iqs5xx->dev_id_info.major_ver, - iqs5xx->dev_id_info.minor_ver, - iqs5xx->exp_file[0], iqs5xx->exp_file[1]); + return sysfs_emit(buf, "%u.%u.%u.%u:%u.%u\n", + be16_to_cpu(iqs5xx->dev_id_info.prod_num), + be16_to_cpu(iqs5xx->dev_id_info.proj_num), + iqs5xx->dev_id_info.major_ver, + iqs5xx->dev_id_info.minor_ver, + iqs5xx->exp_file[0], iqs5xx->exp_file[1]); } static DEVICE_ATTR_WO(fw_file); diff --git a/drivers/input/touchscreen/iqs7211.c b/drivers/input/touchscreen/iqs7211.c index dc084f8737620..f0a56cde899e4 100644 --- a/drivers/input/touchscreen/iqs7211.c +++ b/drivers/input/touchscreen/iqs7211.c @@ -2401,12 +2401,12 @@ static ssize_t fw_info_show(struct device *dev, { struct iqs7211_private *iqs7211 = dev_get_drvdata(dev); - return scnprintf(buf, PAGE_SIZE, "%u.%u.%u.%u:%u.%u\n", - le16_to_cpu(iqs7211->ver_info.prod_num), - le32_to_cpu(iqs7211->ver_info.patch), - le16_to_cpu(iqs7211->ver_info.major), - le16_to_cpu(iqs7211->ver_info.minor), - iqs7211->exp_file[1], iqs7211->exp_file[0]); + return sysfs_emit(buf, "%u.%u.%u.%u:%u.%u\n", + le16_to_cpu(iqs7211->ver_info.prod_num), + le32_to_cpu(iqs7211->ver_info.patch), + le16_to_cpu(iqs7211->ver_info.major), + le16_to_cpu(iqs7211->ver_info.minor), + iqs7211->exp_file[1], iqs7211->exp_file[0]); } static DEVICE_ATTR_RO(fw_info); diff --git a/drivers/input/touchscreen/melfas_mip4.c b/drivers/input/touchscreen/melfas_mip4.c index aa325486f6182..78e1c63e530e0 100644 --- a/drivers/input/touchscreen/melfas_mip4.c +++ b/drivers/input/touchscreen/melfas_mip4.c @@ -1336,9 +1336,9 @@ static ssize_t mip4_sysfs_read_fw_version(struct device *dev, /* Take lock to prevent racing with firmware update */ mutex_lock(&ts->input->mutex); - count = snprintf(buf, PAGE_SIZE, "%04X %04X %04X %04X\n", - ts->fw_version.boot, ts->fw_version.core, - ts->fw_version.app, ts->fw_version.param); + count = sysfs_emit(buf, "%04X %04X %04X %04X\n", + ts->fw_version.boot, ts->fw_version.core, + ts->fw_version.app, ts->fw_version.param); mutex_unlock(&ts->input->mutex); @@ -1362,8 +1362,8 @@ static ssize_t mip4_sysfs_read_hw_version(struct device *dev, * product_name shows the name or version of the hardware * paired with current firmware in the chip. */ - count = snprintf(buf, PAGE_SIZE, "%.*s\n", - (int)sizeof(ts->product_name), ts->product_name); + count = sysfs_emit(buf, "%.*s\n", + (int)sizeof(ts->product_name), ts->product_name); mutex_unlock(&ts->input->mutex); @@ -1382,7 +1382,7 @@ static ssize_t mip4_sysfs_read_product_id(struct device *dev, mutex_lock(&ts->input->mutex); - count = snprintf(buf, PAGE_SIZE, "%04X\n", ts->product_id); + count = sysfs_emit(buf, "%04X\n", ts->product_id); mutex_unlock(&ts->input->mutex); @@ -1401,8 +1401,8 @@ static ssize_t mip4_sysfs_read_ic_name(struct device *dev, mutex_lock(&ts->input->mutex); - count = snprintf(buf, PAGE_SIZE, "%.*s\n", - (int)sizeof(ts->ic_name), ts->ic_name); + count = sysfs_emit(buf, "%.*s\n", + (int)sizeof(ts->ic_name), ts->ic_name); mutex_unlock(&ts->input->mutex); diff --git a/drivers/input/touchscreen/usbtouchscreen.c b/drivers/input/touchscreen/usbtouchscreen.c index d6d04b9f04fc1..60354ebc72424 100644 --- a/drivers/input/touchscreen/usbtouchscreen.c +++ b/drivers/input/touchscreen/usbtouchscreen.c @@ -456,8 +456,8 @@ static ssize_t mtouch_firmware_rev_show(struct device *dev, struct usbtouch_usb *usbtouch = usb_get_intfdata(intf); struct mtouch_priv *priv = usbtouch->priv; - return scnprintf(output, PAGE_SIZE, "%1x.%1x\n", - priv->fw_rev_major, priv->fw_rev_minor); + return sysfs_emit(output, "%1x.%1x\n", + priv->fw_rev_major, priv->fw_rev_minor); } static DEVICE_ATTR(firmware_rev, 0444, mtouch_firmware_rev_show, NULL); diff --git a/drivers/input/touchscreen/wdt87xx_i2c.c b/drivers/input/touchscreen/wdt87xx_i2c.c index 128341a6696bc..32c7be54434cf 100644 --- a/drivers/input/touchscreen/wdt87xx_i2c.c +++ b/drivers/input/touchscreen/wdt87xx_i2c.c @@ -887,7 +887,7 @@ static ssize_t config_csum_show(struct device *dev, cfg_csum = wdt->param.xmls_id1; cfg_csum = (cfg_csum << 16) | wdt->param.xmls_id2; - return scnprintf(buf, PAGE_SIZE, "%x\n", cfg_csum); + return sysfs_emit(buf, "%x\n", cfg_csum); } static ssize_t fw_version_show(struct device *dev, @@ -896,7 +896,7 @@ static ssize_t fw_version_show(struct device *dev, struct i2c_client *client = to_i2c_client(dev); struct wdt87xx_data *wdt = i2c_get_clientdata(client); - return scnprintf(buf, PAGE_SIZE, "%x\n", wdt->param.fw_id); + return sysfs_emit(buf, "%x\n", wdt->param.fw_id); } static ssize_t plat_id_show(struct device *dev, @@ -905,7 +905,7 @@ static ssize_t plat_id_show(struct device *dev, struct i2c_client *client = to_i2c_client(dev); struct wdt87xx_data *wdt = i2c_get_clientdata(client); - return scnprintf(buf, PAGE_SIZE, "%x\n", wdt->param.plat_id); + return sysfs_emit(buf, "%x\n", wdt->param.plat_id); } static ssize_t update_config_store(struct device *dev, -- GitLab From 8fbdb8fb36c68a00555a1a2593c6fff1244a65b3 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Tue, 12 Dec 2023 22:31:44 -0800 Subject: [PATCH 0336/1290] Input: synaptics-rmi4 - use sysfs_emit() to instead of scnprintf() Replace calls to scnprintf() in the methods showing device attributes with sysfs_emit() to simplify the code. Signed-off-by: ye xingchen Link: https://lore.kernel.org/r/202212011551429834598@zte.com.cn Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f01.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/input/rmi4/rmi_f01.c b/drivers/input/rmi4/rmi_f01.c index d7603c50f864b..cc1d4b424640e 100644 --- a/drivers/input/rmi4/rmi_f01.c +++ b/drivers/input/rmi4/rmi_f01.c @@ -267,8 +267,7 @@ static ssize_t rmi_driver_manufacturer_id_show(struct device *dev, struct rmi_driver_data *data = dev_get_drvdata(dev); struct f01_data *f01 = dev_get_drvdata(&data->f01_container->dev); - return scnprintf(buf, PAGE_SIZE, "%d\n", - f01->properties.manufacturer_id); + return sysfs_emit(buf, "%d\n", f01->properties.manufacturer_id); } static DEVICE_ATTR(manufacturer_id, 0444, @@ -280,7 +279,7 @@ static ssize_t rmi_driver_dom_show(struct device *dev, struct rmi_driver_data *data = dev_get_drvdata(dev); struct f01_data *f01 = dev_get_drvdata(&data->f01_container->dev); - return scnprintf(buf, PAGE_SIZE, "%s\n", f01->properties.dom); + return sysfs_emit(buf, "%s\n", f01->properties.dom); } static DEVICE_ATTR(date_of_manufacture, 0444, rmi_driver_dom_show, NULL); @@ -292,7 +291,7 @@ static ssize_t rmi_driver_product_id_show(struct device *dev, struct rmi_driver_data *data = dev_get_drvdata(dev); struct f01_data *f01 = dev_get_drvdata(&data->f01_container->dev); - return scnprintf(buf, PAGE_SIZE, "%s\n", f01->properties.product_id); + return sysfs_emit(buf, "%s\n", f01->properties.product_id); } static DEVICE_ATTR(product_id, 0444, rmi_driver_product_id_show, NULL); @@ -304,7 +303,7 @@ static ssize_t rmi_driver_firmware_id_show(struct device *dev, struct rmi_driver_data *data = dev_get_drvdata(dev); struct f01_data *f01 = dev_get_drvdata(&data->f01_container->dev); - return scnprintf(buf, PAGE_SIZE, "%d\n", f01->properties.firmware_id); + return sysfs_emit(buf, "%d\n", f01->properties.firmware_id); } static DEVICE_ATTR(firmware_id, 0444, rmi_driver_firmware_id_show, NULL); @@ -318,8 +317,8 @@ static ssize_t rmi_driver_package_id_show(struct device *dev, u32 package_id = f01->properties.package_id; - return scnprintf(buf, PAGE_SIZE, "%04x.%04x\n", - package_id & 0xffff, (package_id >> 16) & 0xffff); + return sysfs_emit(buf, "%04x.%04x\n", + package_id & 0xffff, (package_id >> 16) & 0xffff); } static DEVICE_ATTR(package_id, 0444, rmi_driver_package_id_show, NULL); -- GitLab From 3e39104ba81dc8738c3706b55f737db5da4dbbfd Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Tue, 12 Dec 2023 22:21:11 -0800 Subject: [PATCH 0337/1290] Input: ims-pcu - use sysfs_emit() instead of scnprintf() Replace calls to scnprintf() in the methods showing device attributes with sysfs_emit() to simplify the code. Signed-off-by: ye xingchen Link: https://lore.kernel.org/r/202212011548387254492@zte.com.cn Signed-off-by: Dmitry Torokhov --- drivers/input/misc/ims-pcu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/input/misc/ims-pcu.c b/drivers/input/misc/ims-pcu.c index b2f1292e27ef7..6e8cc28debd97 100644 --- a/drivers/input/misc/ims-pcu.c +++ b/drivers/input/misc/ims-pcu.c @@ -1050,7 +1050,7 @@ static ssize_t ims_pcu_attribute_show(struct device *dev, container_of(dattr, struct ims_pcu_attribute, dattr); char *field = (char *)pcu + attr->field_offset; - return scnprintf(buf, PAGE_SIZE, "%.*s\n", attr->field_length, field); + return sysfs_emit(buf, "%.*s\n", attr->field_length, field); } static ssize_t ims_pcu_attribute_store(struct device *dev, @@ -1206,7 +1206,7 @@ ims_pcu_update_firmware_status_show(struct device *dev, struct usb_interface *intf = to_usb_interface(dev); struct ims_pcu *pcu = usb_get_intfdata(intf); - return scnprintf(buf, PAGE_SIZE, "%d\n", pcu->update_firmware_status); + return sysfs_emit(buf, "%d\n", pcu->update_firmware_status); } static DEVICE_ATTR(update_firmware_status, S_IRUGO, @@ -1309,7 +1309,7 @@ static ssize_t ims_pcu_ofn_reg_data_show(struct device *dev, if (error) return error; - return scnprintf(buf, PAGE_SIZE, "%x\n", data); + return sysfs_emit(buf, "%x\n", data); } static ssize_t ims_pcu_ofn_reg_data_store(struct device *dev, @@ -1344,7 +1344,7 @@ static ssize_t ims_pcu_ofn_reg_addr_show(struct device *dev, int error; mutex_lock(&pcu->cmd_mutex); - error = scnprintf(buf, PAGE_SIZE, "%x\n", pcu->ofn_reg_addr); + error = sysfs_emit(buf, "%x\n", pcu->ofn_reg_addr); mutex_unlock(&pcu->cmd_mutex); return error; @@ -1397,7 +1397,7 @@ static ssize_t ims_pcu_ofn_bit_show(struct device *dev, if (error) return error; - return scnprintf(buf, PAGE_SIZE, "%d\n", !!(data & (1 << attr->nr))); + return sysfs_emit(buf, "%d\n", !!(data & (1 << attr->nr))); } static ssize_t ims_pcu_ofn_bit_store(struct device *dev, -- GitLab From 7c73226525702e89aad9fe4c4683826ed5e24361 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Tue, 12 Dec 2023 22:28:54 -0800 Subject: [PATCH 0338/1290] Input: iqs269a - use sysfs_emit() instead of scnprintf() Replace calls to scnprintf() in the methods showing device attributes with sysfs_emit() to simplify the code. Signed-off-by: ye xingchen Link: https://lore.kernel.org/r/202212011548387254492@zte.com.cn Signed-off-by: Dmitry Torokhov --- drivers/input/misc/iqs269a.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/input/misc/iqs269a.c b/drivers/input/misc/iqs269a.c index 3c636c75e8a1f..1abce35b955e4 100644 --- a/drivers/input/misc/iqs269a.c +++ b/drivers/input/misc/iqs269a.c @@ -1286,7 +1286,7 @@ static ssize_t counts_show(struct device *dev, if (error) return error; - return scnprintf(buf, PAGE_SIZE, "%u\n", le16_to_cpu(counts)); + return sysfs_emit(buf, "%u\n", le16_to_cpu(counts)); } static ssize_t hall_bin_show(struct device *dev, @@ -1324,7 +1324,7 @@ static ssize_t hall_bin_show(struct device *dev, return -EINVAL; } - return scnprintf(buf, PAGE_SIZE, "%u\n", val); + return sysfs_emit(buf, "%u\n", val); } static ssize_t hall_enable_show(struct device *dev, @@ -1332,7 +1332,7 @@ static ssize_t hall_enable_show(struct device *dev, { struct iqs269_private *iqs269 = dev_get_drvdata(dev); - return scnprintf(buf, PAGE_SIZE, "%u\n", iqs269->hall_enable); + return sysfs_emit(buf, "%u\n", iqs269->hall_enable); } static ssize_t hall_enable_store(struct device *dev, @@ -1362,7 +1362,7 @@ static ssize_t ch_number_show(struct device *dev, { struct iqs269_private *iqs269 = dev_get_drvdata(dev); - return scnprintf(buf, PAGE_SIZE, "%u\n", iqs269->ch_num); + return sysfs_emit(buf, "%u\n", iqs269->ch_num); } static ssize_t ch_number_store(struct device *dev, @@ -1391,8 +1391,7 @@ static ssize_t rx_enable_show(struct device *dev, struct iqs269_private *iqs269 = dev_get_drvdata(dev); struct iqs269_ch_reg *ch_reg = iqs269->sys_reg.ch_reg; - return scnprintf(buf, PAGE_SIZE, "%u\n", - ch_reg[iqs269->ch_num].rx_enable); + return sysfs_emit(buf, "%u\n", ch_reg[iqs269->ch_num].rx_enable); } static ssize_t rx_enable_store(struct device *dev, @@ -1432,7 +1431,7 @@ static ssize_t ati_mode_show(struct device *dev, if (error) return error; - return scnprintf(buf, PAGE_SIZE, "%u\n", val); + return sysfs_emit(buf, "%u\n", val); } static ssize_t ati_mode_store(struct device *dev, @@ -1465,7 +1464,7 @@ static ssize_t ati_base_show(struct device *dev, if (error) return error; - return scnprintf(buf, PAGE_SIZE, "%u\n", val); + return sysfs_emit(buf, "%u\n", val); } static ssize_t ati_base_store(struct device *dev, @@ -1498,7 +1497,7 @@ static ssize_t ati_target_show(struct device *dev, if (error) return error; - return scnprintf(buf, PAGE_SIZE, "%u\n", val); + return sysfs_emit(buf, "%u\n", val); } static ssize_t ati_target_store(struct device *dev, @@ -1525,9 +1524,9 @@ static ssize_t ati_trigger_show(struct device *dev, { struct iqs269_private *iqs269 = dev_get_drvdata(dev); - return scnprintf(buf, PAGE_SIZE, "%u\n", - iqs269->ati_current && - completion_done(&iqs269->ati_done)); + return sysfs_emit(buf, "%u\n", + iqs269->ati_current && + completion_done(&iqs269->ati_done)); } static ssize_t ati_trigger_store(struct device *dev, -- GitLab From 51835758e8a9ce089c47358ea373de82309234de Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Tue, 12 Dec 2023 22:41:50 -0800 Subject: [PATCH 0339/1290] Input: vivaldi - convert to use sysfs_emit_at() API Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: ye xingchen Link: https://lore.kernel.org/r/202212071644171074630@zte.com.cn Signed-off-by: Dmitry Torokhov --- drivers/input/vivaldi-fmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/input/vivaldi-fmap.c b/drivers/input/vivaldi-fmap.c index 6dae83d968067..0d29ec014e2f9 100644 --- a/drivers/input/vivaldi-fmap.c +++ b/drivers/input/vivaldi-fmap.c @@ -27,10 +27,10 @@ ssize_t vivaldi_function_row_physmap_show(const struct vivaldi_data *data, return 0; for (i = 0; i < data->num_function_row_keys; i++) - size += scnprintf(buf + size, PAGE_SIZE - size, - "%s%02X", size ? " " : "", physmap[i]); + size += sysfs_emit_at(buf, size, + "%s%02X", size ? " " : "", physmap[i]); if (size) - size += scnprintf(buf + size, PAGE_SIZE - size, "\n"); + size += sysfs_emit_at(buf, size, "\n"); return size; } -- GitLab From 6caa290684255991ffeebf228b2fd9e7e4da8f34 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 12 Dec 2023 23:02:56 -0800 Subject: [PATCH 0340/1290] Input: navpoint - convert to use GPIO descriptor The Navpoint driver uses a GPIO line, convert this to use a GPIO descriptor. There are no in-kernel users but out of tree users can easily be added or converted using a GPIO descriptor table as with numerous other drivers. Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-1-9433162914a3@linaro.org Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/navpoint.c | 41 +++++++++++++--------------------- include/linux/input/navpoint.h | 1 - 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/drivers/input/mouse/navpoint.c b/drivers/input/mouse/navpoint.c index c00dc1275da23..ba757783c258a 100644 --- a/drivers/input/mouse/navpoint.c +++ b/drivers/input/mouse/navpoint.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -32,7 +32,7 @@ struct navpoint { struct ssp_device *ssp; struct input_dev *input; struct device *dev; - int gpio; + struct gpio_desc *gpiod; int index; u8 data[1 + HEADER_LENGTH(0xff)]; }; @@ -170,16 +170,14 @@ static void navpoint_up(struct navpoint *navpoint) dev_err(navpoint->dev, "timeout waiting for SSSR[CSS] to clear\n"); - if (gpio_is_valid(navpoint->gpio)) - gpio_set_value(navpoint->gpio, 1); + gpiod_set_value(navpoint->gpiod, 1); } static void navpoint_down(struct navpoint *navpoint) { struct ssp_device *ssp = navpoint->ssp; - if (gpio_is_valid(navpoint->gpio)) - gpio_set_value(navpoint->gpio, 0); + gpiod_set_value(navpoint->gpiod, 0); pxa_ssp_write_reg(ssp, SSCR0, 0); @@ -216,18 +214,9 @@ static int navpoint_probe(struct platform_device *pdev) return -EINVAL; } - if (gpio_is_valid(pdata->gpio)) { - error = gpio_request_one(pdata->gpio, GPIOF_OUT_INIT_LOW, - "SYNAPTICS_ON"); - if (error) - return error; - } - ssp = pxa_ssp_request(pdata->port, pdev->name); - if (!ssp) { - error = -ENODEV; - goto err_free_gpio; - } + if (!ssp) + return -ENODEV; /* HaRET does not disable devices before jumping into Linux */ if (pxa_ssp_read_reg(ssp, SSCR0) & SSCR0_SSE) { @@ -242,10 +231,18 @@ static int navpoint_probe(struct platform_device *pdev) goto err_free_mem; } + navpoint->gpiod = gpiod_get_optional(&pdev->dev, + NULL, GPIOD_OUT_LOW); + if (IS_ERR(navpoint->gpiod)) { + error = PTR_ERR(navpoint->gpiod); + dev_err(&pdev->dev, "error getting GPIO\n"); + goto err_free_mem; + } + gpiod_set_consumer_name(navpoint->gpiod, "SYNAPTICS_ON"); + navpoint->ssp = ssp; navpoint->input = input; navpoint->dev = &pdev->dev; - navpoint->gpio = pdata->gpio; input->name = pdev->name; input->dev.parent = &pdev->dev; @@ -288,17 +285,12 @@ err_free_mem: input_free_device(input); kfree(navpoint); pxa_ssp_free(ssp); -err_free_gpio: - if (gpio_is_valid(pdata->gpio)) - gpio_free(pdata->gpio); return error; } static void navpoint_remove(struct platform_device *pdev) { - const struct navpoint_platform_data *pdata = - dev_get_platdata(&pdev->dev); struct navpoint *navpoint = platform_get_drvdata(pdev); struct ssp_device *ssp = navpoint->ssp; @@ -308,9 +300,6 @@ static void navpoint_remove(struct platform_device *pdev) kfree(navpoint); pxa_ssp_free(ssp); - - if (gpio_is_valid(pdata->gpio)) - gpio_free(pdata->gpio); } static int navpoint_suspend(struct device *dev) diff --git a/include/linux/input/navpoint.h b/include/linux/input/navpoint.h index d464ffb4db52b..5192ae3f5ec1b 100644 --- a/include/linux/input/navpoint.h +++ b/include/linux/input/navpoint.h @@ -5,5 +5,4 @@ struct navpoint_platform_data { int port; /* PXA SSP port for pxa_ssp_request() */ - int gpio; /* GPIO for power on/off */ }; -- GitLab From 1ba05c92682fcc6d0ffb2fce5db68fb517278797 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 12 Dec 2023 23:04:35 -0800 Subject: [PATCH 0341/1290] Input: tca6416-keypad - drop unused include The TCA6416 keypad driver is including the legacy GPIO header for no reason, it is not using any of its symbols. Drop the header. Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-2-9433162914a3@linaro.org Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/tca6416-keypad.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/input/keyboard/tca6416-keypad.c b/drivers/input/keyboard/tca6416-keypad.c index 8af59ced1ec2e..677bc4baa5d19 100644 --- a/drivers/input/keyboard/tca6416-keypad.c +++ b/drivers/input/keyboard/tca6416-keypad.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include -- GitLab From e53c18da99c75f080bd99436c57824f2ab657f03 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 12 Dec 2023 23:06:55 -0800 Subject: [PATCH 0342/1290] Input: omap-keypad - drop optional GPIO support The driver supports passing some GPIO lines for rows and columns through the driver data, but there is no in-kernel user of this. Further the use seems convoluted because the GPIO lines are unused in the driver, then explicitly free:ed when removing it without being requested when probing it, which is assymetric and just a recepie for disaster. Remove the support for these unused GPIOs, if need be support can be reestablished in an organized fashion using GPIO descriptors. Signed-off-by: Linus Walleij Reviewed-by: Tony Lindgren Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-3-9433162914a3@linaro.org Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/omap-keypad.c | 19 +------------------ include/linux/platform_data/keypad-omap.h | 3 --- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/drivers/input/keyboard/omap-keypad.c b/drivers/input/keyboard/omap-keypad.c index 454fb86756573..16f936db73058 100644 --- a/drivers/input/keyboard/omap-keypad.c +++ b/drivers/input/keyboard/omap-keypad.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -49,9 +48,6 @@ struct omap_kp { static DECLARE_TASKLET_DISABLED_OLD(kp_tasklet, omap_kp_tasklet); -static unsigned int *row_gpios; -static unsigned int *col_gpios; - static irqreturn_t omap_kp_interrupt(int irq, void *dev_id) { /* disable keyboard interrupt and schedule for handling */ @@ -180,7 +176,7 @@ static int omap_kp_probe(struct platform_device *pdev) struct omap_kp *omap_kp; struct input_dev *input_dev; struct omap_kp_platform_data *pdata = dev_get_platdata(&pdev->dev); - int i, col_idx, row_idx, ret; + int ret; unsigned int row_shift, keycodemax; if (!pdata->rows || !pdata->cols || !pdata->keymap_data) { @@ -209,17 +205,9 @@ static int omap_kp_probe(struct platform_device *pdev) if (pdata->delay) omap_kp->delay = pdata->delay; - if (pdata->row_gpios && pdata->col_gpios) { - row_gpios = pdata->row_gpios; - col_gpios = pdata->col_gpios; - } - omap_kp->rows = pdata->rows; omap_kp->cols = pdata->cols; - col_idx = 0; - row_idx = 0; - timer_setup(&omap_kp->timer, omap_kp_timer, 0); /* get the irq and init timer*/ @@ -276,11 +264,6 @@ err4: err3: device_remove_file(&pdev->dev, &dev_attr_enable); err2: - for (i = row_idx - 1; i >= 0; i--) - gpio_free(row_gpios[i]); - for (i = col_idx - 1; i >= 0; i--) - gpio_free(col_gpios[i]); - kfree(omap_kp); input_free_device(input_dev); diff --git a/include/linux/platform_data/keypad-omap.h b/include/linux/platform_data/keypad-omap.h index 3e7c64c854f4c..f3f1311cdf3aa 100644 --- a/include/linux/platform_data/keypad-omap.h +++ b/include/linux/platform_data/keypad-omap.h @@ -19,9 +19,6 @@ struct omap_kp_platform_data { bool rep; unsigned long delay; bool dbounce; - /* specific to OMAP242x*/ - unsigned int *row_gpios; - unsigned int *col_gpios; }; /* Group (0..3) -- when multiple keys are pressed, only the -- GitLab From 7395de647e87476f5b5d2f9a9fe80cee86b4e7cc Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 12 Dec 2023 23:04:47 -0800 Subject: [PATCH 0343/1290] Input: as5011 - convert to GPIO descriptor This driver does not have any in-tree users but is passing a legacy GPIO number through platform data. Convert it to use a GPIO descriptor, new users or outoftree users can easily be implemented using GPIO descriptor tables or software nodes. Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-4-9433162914a3@linaro.org Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/as5011.c | 24 +++++++++++------------- include/linux/input/as5011.h | 1 - 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/input/joystick/as5011.c b/drivers/input/joystick/as5011.c index bf8b1cc0ea9c7..f1822c19a289d 100644 --- a/drivers/input/joystick/as5011.c +++ b/drivers/input/joystick/as5011.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -61,7 +61,7 @@ MODULE_LICENSE("GPL"); struct as5011_device { struct input_dev *input_dev; struct i2c_client *i2c_client; - unsigned int button_gpio; + struct gpio_desc *button_gpiod; unsigned int button_irq; unsigned int axis_irq; }; @@ -114,7 +114,7 @@ static int as5011_i2c_read(struct i2c_client *client, static irqreturn_t as5011_button_interrupt(int irq, void *dev_id) { struct as5011_device *as5011 = dev_id; - int val = gpio_get_value_cansleep(as5011->button_gpio); + int val = gpiod_get_value_cansleep(as5011->button_gpiod); input_report_key(as5011->input_dev, BTN_JOYSTICK, !val); input_sync(as5011->input_dev); @@ -248,7 +248,6 @@ static int as5011_probe(struct i2c_client *client) as5011->i2c_client = client; as5011->input_dev = input_dev; - as5011->button_gpio = plat_data->button_gpio; as5011->axis_irq = plat_data->axis_irq; input_dev->name = "Austria Microsystem as5011 joystick"; @@ -262,18 +261,20 @@ static int as5011_probe(struct i2c_client *client) input_set_abs_params(as5011->input_dev, ABS_Y, AS5011_MIN_AXIS, AS5011_MAX_AXIS, AS5011_FUZZ, AS5011_FLAT); - error = gpio_request(as5011->button_gpio, "AS5011 button"); - if (error < 0) { - dev_err(&client->dev, "Failed to request button gpio\n"); + as5011->button_gpiod = devm_gpiod_get(&client->dev, NULL, GPIOD_IN); + if (IS_ERR(as5011->button_gpiod)) { + error = PTR_ERR(as5011->button_gpiod); + dev_err(&client->dev, "Failed to request button GPIO\n"); goto err_free_mem; } + gpiod_set_consumer_name(as5011->button_gpiod, "AS5011 button"); - irq = gpio_to_irq(as5011->button_gpio); + irq = gpiod_to_irq(as5011->button_gpiod); if (irq < 0) { dev_err(&client->dev, "Failed to get irq number for button gpio\n"); error = irq; - goto err_free_button_gpio; + goto err_free_mem; } as5011->button_irq = irq; @@ -286,7 +287,7 @@ static int as5011_probe(struct i2c_client *client) if (error < 0) { dev_err(&client->dev, "Can't allocate button irq %d\n", as5011->button_irq); - goto err_free_button_gpio; + goto err_free_mem; } error = as5011_configure_chip(as5011, plat_data); @@ -317,8 +318,6 @@ err_free_axis_irq: free_irq(as5011->axis_irq, as5011); err_free_button_irq: free_irq(as5011->button_irq, as5011); -err_free_button_gpio: - gpio_free(as5011->button_gpio); err_free_mem: input_free_device(input_dev); kfree(as5011); @@ -332,7 +331,6 @@ static void as5011_remove(struct i2c_client *client) free_irq(as5011->axis_irq, as5011); free_irq(as5011->button_irq, as5011); - gpio_free(as5011->button_gpio); input_unregister_device(as5011->input_dev); kfree(as5011); diff --git a/include/linux/input/as5011.h b/include/linux/input/as5011.h index 5fba52a56cd61..5705d5de3aeae 100644 --- a/include/linux/input/as5011.h +++ b/include/linux/input/as5011.h @@ -7,7 +7,6 @@ */ struct as5011_platform_data { - unsigned int button_gpio; unsigned int axis_irq; /* irq number */ unsigned long axis_irqflags; char xp, xn; /* threshold for x axis */ -- GitLab From 7a92fc8b4d20680e4c20289a670d8fca2d1f2c1b Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 12 Dec 2023 22:34:56 +0100 Subject: [PATCH 0344/1290] mm: Introduce flush_cache_vmap_early() The pcpu setup when using the page allocator sets up a new vmalloc mapping very early in the boot process, so early that it cannot use the flush_cache_vmap() function which may depend on structures not yet initialized (for example in riscv, we currently send an IPI to flush other cpus TLB). But on some architectures, we must call flush_cache_vmap(): for example, in riscv, some uarchs can cache invalid TLB entries so we need to flush the new established mapping to avoid taking an exception. So fix this by introducing a new function flush_cache_vmap_early() which is called right after setting the new page table entry and before accessing this new mapping. This new function implements a local flush tlb on riscv and is no-op for other architectures (same as today). Signed-off-by: Alexandre Ghiti Acked-by: Geert Uytterhoeven Signed-off-by: Dennis Zhou --- arch/arc/include/asm/cacheflush.h | 1 + arch/arm/include/asm/cacheflush.h | 2 ++ arch/csky/abiv1/inc/abi/cacheflush.h | 1 + arch/csky/abiv2/inc/abi/cacheflush.h | 1 + arch/m68k/include/asm/cacheflush_mm.h | 1 + arch/mips/include/asm/cacheflush.h | 2 ++ arch/nios2/include/asm/cacheflush.h | 1 + arch/parisc/include/asm/cacheflush.h | 1 + arch/riscv/include/asm/cacheflush.h | 3 ++- arch/riscv/include/asm/tlbflush.h | 1 + arch/riscv/mm/tlbflush.c | 5 +++++ arch/sh/include/asm/cacheflush.h | 1 + arch/sparc/include/asm/cacheflush_32.h | 1 + arch/sparc/include/asm/cacheflush_64.h | 1 + arch/xtensa/include/asm/cacheflush.h | 6 ++++-- include/asm-generic/cacheflush.h | 6 ++++++ mm/percpu.c | 8 +------- 17 files changed, 32 insertions(+), 10 deletions(-) diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h index bd5b1a9a05440..6fc74500a9f52 100644 --- a/arch/arc/include/asm/cacheflush.h +++ b/arch/arc/include/asm/cacheflush.h @@ -40,6 +40,7 @@ void dma_cache_wback(phys_addr_t start, unsigned long sz); /* TBD: optimize this */ #define flush_cache_vmap(start, end) flush_cache_all() +#define flush_cache_vmap_early(start, end) do { } while (0) #define flush_cache_vunmap(start, end) flush_cache_all() #define flush_cache_dup_mm(mm) /* called on fork (VIVT only) */ diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index f6181f69577fe..1075534b0a2ee 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -340,6 +340,8 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) dsb(ishst); } +#define flush_cache_vmap_early(start, end) do { } while (0) + static inline void flush_cache_vunmap(unsigned long start, unsigned long end) { if (!cache_is_vipt_nonaliasing()) diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index 908d8b0bc4fdc..d011a81575d21 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -43,6 +43,7 @@ static inline void flush_anon_page(struct vm_area_struct *vma, */ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); #define flush_cache_vmap(start, end) cache_wbinv_all() +#define flush_cache_vmap_early(start, end) do { } while (0) #define flush_cache_vunmap(start, end) cache_wbinv_all() #define flush_icache_range(start, end) cache_wbinv_range(start, end) diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h index 40be16907267d..6513ac5d25788 100644 --- a/arch/csky/abiv2/inc/abi/cacheflush.h +++ b/arch/csky/abiv2/inc/abi/cacheflush.h @@ -41,6 +41,7 @@ void flush_icache_mm_range(struct mm_struct *mm, void flush_icache_deferred(struct mm_struct *mm); #define flush_cache_vmap(start, end) do { } while (0) +#define flush_cache_vmap_early(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index ed12358c4783b..9a71b0148461a 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -191,6 +191,7 @@ extern void cache_push_v(unsigned long vaddr, int len); #define flush_cache_all() __flush_cache_all() #define flush_cache_vmap(start, end) flush_cache_all() +#define flush_cache_vmap_early(start, end) do { } while (0) #define flush_cache_vunmap(start, end) flush_cache_all() static inline void flush_cache_mm(struct mm_struct *mm) diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index f36c2519ed976..1f14132b3fc98 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -97,6 +97,8 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) __flush_cache_vmap(); } +#define flush_cache_vmap_early(start, end) do { } while (0) + extern void (*__flush_cache_vunmap)(void); static inline void flush_cache_vunmap(unsigned long start, unsigned long end) diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h index 348cea0977927..81484a776b333 100644 --- a/arch/nios2/include/asm/cacheflush.h +++ b/arch/nios2/include/asm/cacheflush.h @@ -38,6 +38,7 @@ void flush_icache_pages(struct vm_area_struct *vma, struct page *page, #define flush_icache_pages flush_icache_pages #define flush_cache_vmap(start, end) flush_dcache_range(start, end) +#define flush_cache_vmap_early(start, end) do { } while (0) #define flush_cache_vunmap(start, end) flush_dcache_range(start, end) extern void copy_to_user_page(struct vm_area_struct *vma, struct page *page, diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index b4006f2a97052..ba4c05bc24d69 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -41,6 +41,7 @@ void flush_kernel_vmap_range(void *vaddr, int size); void invalidate_kernel_vmap_range(void *vaddr, int size); #define flush_cache_vmap(start, end) flush_cache_all() +#define flush_cache_vmap_early(start, end) do { } while (0) #define flush_cache_vunmap(start, end) flush_cache_all() void flush_dcache_folio(struct folio *folio); diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 3cb53c4df27cf..a129dac4521d3 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -37,7 +37,8 @@ static inline void flush_dcache_page(struct page *page) flush_icache_mm(vma->vm_mm, 0) #ifdef CONFIG_64BIT -#define flush_cache_vmap(start, end) flush_tlb_kernel_range(start, end) +#define flush_cache_vmap(start, end) flush_tlb_kernel_range(start, end) +#define flush_cache_vmap_early(start, end) local_flush_tlb_kernel_range(start, end) #endif #ifndef CONFIG_SMP diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index 8f3418c5f1724..a60416bbe1904 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -41,6 +41,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr); void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); void flush_tlb_kernel_range(unsigned long start, unsigned long end); +void local_flush_tlb_kernel_range(unsigned long start, unsigned long end); #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index e6659d7368b35..8aadc5f71c93b 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -66,6 +66,11 @@ static inline void local_flush_tlb_range_asid(unsigned long start, local_flush_tlb_range_threshold_asid(start, size, stride, asid); } +void local_flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + local_flush_tlb_range_asid(start, end, PAGE_SIZE, FLUSH_TLB_NO_ASID); +} + static void __ipi_flush_tlb_all(void *info) { local_flush_tlb_all(); diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 878b6b551bd2d..51112f54552b3 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -90,6 +90,7 @@ extern void copy_from_user_page(struct vm_area_struct *vma, unsigned long len); #define flush_cache_vmap(start, end) local_flush_cache_all(NULL) +#define flush_cache_vmap_early(start, end) do { } while (0) #define flush_cache_vunmap(start, end) local_flush_cache_all(NULL) #define flush_dcache_mmap_lock(mapping) do { } while (0) diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h index f3b7270bf71b2..9fee0ccfccb8e 100644 --- a/arch/sparc/include/asm/cacheflush_32.h +++ b/arch/sparc/include/asm/cacheflush_32.h @@ -48,6 +48,7 @@ static inline void flush_dcache_page(struct page *page) #define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_cache_vmap(start, end) flush_cache_all() +#define flush_cache_vmap_early(start, end) do { } while (0) #define flush_cache_vunmap(start, end) flush_cache_all() /* When a context switch happens we must flush all user windows so that diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h index 0e879004efff1..2b1261b77ecd1 100644 --- a/arch/sparc/include/asm/cacheflush_64.h +++ b/arch/sparc/include/asm/cacheflush_64.h @@ -75,6 +75,7 @@ void flush_ptrace_access(struct vm_area_struct *, struct page *, #define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_cache_vmap(start, end) do { } while (0) +#define flush_cache_vmap_early(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) #endif /* !__ASSEMBLY__ */ diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h index 785a00ce83c11..38bcecb0e457d 100644 --- a/arch/xtensa/include/asm/cacheflush.h +++ b/arch/xtensa/include/asm/cacheflush.h @@ -116,8 +116,9 @@ void flush_cache_page(struct vm_area_struct*, #define flush_cache_mm(mm) flush_cache_all() #define flush_cache_dup_mm(mm) flush_cache_mm(mm) -#define flush_cache_vmap(start,end) flush_cache_all() -#define flush_cache_vunmap(start,end) flush_cache_all() +#define flush_cache_vmap(start,end) flush_cache_all() +#define flush_cache_vmap_early(start,end) do { } while (0) +#define flush_cache_vunmap(start,end) flush_cache_all() void flush_dcache_folio(struct folio *folio); #define flush_dcache_folio flush_dcache_folio @@ -140,6 +141,7 @@ void local_flush_cache_page(struct vm_area_struct *vma, #define flush_cache_dup_mm(mm) do { } while (0) #define flush_cache_vmap(start,end) do { } while (0) +#define flush_cache_vmap_early(start,end) do { } while (0) #define flush_cache_vunmap(start,end) do { } while (0) #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 84ec53ccc4502..7ee8a179d1036 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -91,6 +91,12 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) } #endif +#ifndef flush_cache_vmap_early +static inline void flush_cache_vmap_early(unsigned long start, unsigned long end) +{ +} +#endif + #ifndef flush_cache_vunmap static inline void flush_cache_vunmap(unsigned long start, unsigned long end) { diff --git a/mm/percpu.c b/mm/percpu.c index 7b97d31df7676..4e11fc1e6deff 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3333,13 +3333,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t if (rc < 0) panic("failed to map percpu area, err=%d\n", rc); - /* - * FIXME: Archs with virtual cache should flush local - * cache for the linear mapping here - something - * equivalent to flush_cache_vmap() on the local cpu. - * flush_cache_vmap() can't be used as most supporting - * data structures are not set up yet. - */ + flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size); /* copy static data */ memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); -- GitLab From 6b9f29b81b155af023da95f560f738f29722b306 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 12 Dec 2023 22:34:57 +0100 Subject: [PATCH 0345/1290] riscv: Enable pcpu page first chunk allocator As explained in commit 6ea529a2037c ("percpu: make embedding first chunk allocator check vmalloc space size"), the embedding first chunk allocator needs the vmalloc space to be larger than the maximum distance between units which are grouped into NUMA nodes. On a very sparse NUMA configurations and a small vmalloc area (for example, it is 64GB in sv39), the allocation of dynamic percpu data in the vmalloc area could fail. So provide the pcpu page allocator as a fallback in case we fall into such a sparse configuration (which happened in arm64 as shown by commit 09cea6195073 ("arm64: support page mapping percpu first chunk allocator")). Signed-off-by: Alexandre Ghiti Signed-off-by: Dennis Zhou --- arch/riscv/Kconfig | 2 ++ arch/riscv/mm/kasan_init.c | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 95a2a06acc6a6..80e9b2ac34ab2 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -414,7 +414,9 @@ config NUMA depends on SMP && MMU select ARCH_SUPPORTS_NUMA_BALANCING select GENERIC_ARCH_NUMA + select HAVE_SETUP_PER_CPU_AREA select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PER_CPU_PAGE_FIRST_CHUNK select OF_NUMA select USE_PERCPU_NUMA_NODE_ID help diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 5e39dcf23fdbc..4c9a2c527f08f 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -438,6 +438,14 @@ static void __init kasan_shallow_populate(void *start, void *end) kasan_shallow_populate_pgd(vaddr, vend); } +#ifdef CONFIG_KASAN_VMALLOC +void __init kasan_populate_early_vm_area_shadow(void *start, unsigned long size) +{ + kasan_populate(kasan_mem_to_shadow(start), + kasan_mem_to_shadow(start + size)); +} +#endif + static void __init create_tmp_mapping(void) { void *ptr; -- GitLab From 6f33e6fa29d0366d6e5b3ea2930dbc0b648151fe Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 13 Dec 2023 22:02:56 -0800 Subject: [PATCH 0346/1290] perf stat: Combine the -A/--no-aggr and --no-merge options The -A or --no-aggr option disables aggregation of core events: $ perf stat -A -e cycles,data_total -a true Performance counter stats for 'system wide': CPU0 1,287,665 cycles CPU1 1,831,681 cycles CPU2 27,345,998 cycles CPU3 1,964,799 cycles CPU4 236,174 cycles CPU5 3,302,825 cycles CPU6 9,201,446 cycles CPU7 1,403,043 cycles CPU0 110.90 MiB data_total 0.008961761 seconds time elapsed The --no-merge option disables the aggregation of uncore events: $ perf stat --no-merge -e cycles,data_total -a true Performance counter stats for 'system wide': 38,482,778 cycles 15.04 MiB data_total [uncore_imc_free_running_1] 15.00 MiB data_total [uncore_imc_free_running_0] 0.005915155 seconds time elapsed Having two options confuses users who generally don't appreciate the difference in PMUs. Keep all the options but make it so they all disable aggregation both of core and uncore events: $ perf stat -A -e cycles,data_total -a true Performance counter stats for 'system wide': CPU0 85,878 cycles CPU1 88,179 cycles CPU2 60,872 cycles CPU3 3,265,567 cycles CPU4 82,357 cycles CPU5 83,383 cycles CPU6 84,156 cycles CPU7 220,803 cycles CPU0 2.38 MiB data_total [uncore_imc_free_running_0] CPU0 2.38 MiB data_total [uncore_imc_free_running_1] 0.001397205 seconds time elapsed Update the relevant 'perf stat' man page information. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Athira Jajeev Cc: Changbin Du Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kaige Ye Cc: Mark Rutland Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231214060256.2094017-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 52 ++++++++++++++------------ tools/perf/builtin-stat.c | 5 ++- tools/perf/util/stat-display.c | 2 +- tools/perf/util/stat.c | 2 +- tools/perf/util/stat.h | 1 - 5 files changed, 33 insertions(+), 29 deletions(-) diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 8f789fa1242e0..5af2e432b54fb 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -422,7 +422,34 @@ See perf list output for the possible metrics and metricgroups. -A:: --no-aggr:: -Do not aggregate counts across all monitored CPUs. +--no-merge:: +Do not aggregate/merge counts across monitored CPUs or PMUs. + +When multiple events are created from a single event specification, +stat will, by default, aggregate the event counts and show the result +in a single row. This option disables that behavior and shows the +individual events and counts. + +Multiple events are created from a single event specification when: + +1. PID monitoring isn't requested and the system has more than one + CPU. For example, a system with 8 SMT threads will have one event + opened on each thread and aggregation is performed across them. + +2. Prefix or glob wildcard matching is used for the PMU name. For + example, multiple memory controller PMUs may exist typically with a + suffix of _0, _1, etc. By default the event counts will all be + combined if the PMU is specified without the suffix such as + uncore_imc rather than uncore_imc_0. + +3. Aliases, which are listed immediately after the Kernel PMU events + by perf list, are used. + +--hybrid-merge:: +Merge core event counts from all core PMUs. In hybrid or big.LITTLE +systems by default each core PMU will report its count +separately. This option forces core PMU counts to be combined to give +a behavior closer to having a single CPU type in the system. --topdown:: Print top-down metrics supported by the CPU. This allows to determine @@ -475,29 +502,6 @@ highlight 'tma_frontend_bound'. This metric may be drilled into with Error out if the input is higher than the supported max level. ---no-merge:: -Do not merge results from same PMUs. - -When multiple events are created from a single event specification, -stat will, by default, aggregate the event counts and show the result -in a single row. This option disables that behavior and shows -the individual events and counts. - -Multiple events are created from a single event specification when: -1. Prefix or glob matching is used for the PMU name. -2. Aliases, which are listed immediately after the Kernel PMU events - by perf list, are used. - ---hybrid-merge:: -Merge the hybrid event counts from all PMUs. - -For hybrid events, by default, the stat aggregates and reports the event -counts per PMU. But sometimes, it's also useful to aggregate event counts -from all PMUs. This option enables that behavior and reports the counts -without PMUs. - -For non-hybrid events, it should be no effect. - --smi-cost:: Measure SMI cost if msr/aperf/ and msr/smi/ events are supported. diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index bda020c0b9d52..5fe9abc6a5241 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1204,8 +1204,9 @@ static struct option stat_options[] = { OPT_STRING('C', "cpu", &target.cpu_list, "cpu", "list of cpus to monitor in system-wide"), OPT_SET_UINT('A', "no-aggr", &stat_config.aggr_mode, - "disable CPU count aggregation", AGGR_NONE), - OPT_BOOLEAN(0, "no-merge", &stat_config.no_merge, "Do not merge identical named events"), + "disable aggregation across CPUs or PMUs", AGGR_NONE), + OPT_SET_UINT(0, "no-merge", &stat_config.aggr_mode, + "disable aggregation the same as -A or -no-aggr", AGGR_NONE), OPT_BOOLEAN(0, "hybrid-merge", &stat_config.hybrid_merge, "Merge identical named hybrid events"), OPT_STRING('x', "field-separator", &stat_config.csv_sep, "separator", diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index afe6db8e7bf4f..8c61f8627ebc9 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -898,7 +898,7 @@ static bool hybrid_uniquify(struct evsel *evsel, struct perf_stat_config *config static void uniquify_counter(struct perf_stat_config *config, struct evsel *counter) { - if (config->no_merge || hybrid_uniquify(counter, config)) + if (config->aggr_mode == AGGR_NONE || hybrid_uniquify(counter, config)) uniquify_event_name(counter); } diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 012c4946b9c49..b0bcf92f0f9c3 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -592,7 +592,7 @@ void perf_stat_merge_counters(struct perf_stat_config *config, struct evlist *ev { struct evsel *evsel; - if (config->no_merge) + if (config->aggr_mode == AGGR_NONE) return; evlist__for_each_entry(evlist, evsel) diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 325d0fad18424..4357ba1148221 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -76,7 +76,6 @@ struct perf_stat_config { bool null_run; bool ru_display; bool big_num; - bool no_merge; bool hybrid_merge; bool walltime_run_table; bool all_kernel; -- GitLab From 1af478903fc48c1409a8dd6b698383b62387adf1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 11 Dec 2023 23:05:44 -0800 Subject: [PATCH 0347/1290] perf genelf: Set ELF program header addresses properly The text section starts after the ELF headers so PHDR.p_vaddr and others should have the correct addresses. Fixes: babd04386b1df8c3 ("perf jit: Include program header in ELF files") Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Fangrui Song Cc: Ingo Molnar Cc: Jiri Olsa Cc: Lieven Hey Cc: Milian Wolff Cc: Pablo Galindo Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231212070547.612536-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/genelf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c index fefc72066c4e8..ac17a3cb59dc0 100644 --- a/tools/perf/util/genelf.c +++ b/tools/perf/util/genelf.c @@ -293,9 +293,9 @@ jit_write_elf(int fd, uint64_t load_addr, const char *sym, */ phdr = elf_newphdr(e, 1); phdr[0].p_type = PT_LOAD; - phdr[0].p_offset = 0; - phdr[0].p_vaddr = 0; - phdr[0].p_paddr = 0; + phdr[0].p_offset = GEN_ELF_TEXT_OFFSET; + phdr[0].p_vaddr = GEN_ELF_TEXT_OFFSET; + phdr[0].p_paddr = GEN_ELF_TEXT_OFFSET; phdr[0].p_filesz = csize; phdr[0].p_memsz = csize; phdr[0].p_flags = PF_X | PF_R; -- GitLab From 83e1bdc94f32dcf52dfcd2025acc7a2b9376b1e8 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 14 Dec 2023 11:28:25 +1300 Subject: [PATCH 0348/1290] x86/virt/tdx: Make TDX host depend on X86_MCE A build failure was reported that when INTEL_TDX_HOST is enabled but X86_MCE is not, the tdx_dump_mce_info() function fails to link: ld: vmlinux.o: in function `tdx_dump_mce_info': ...: undefined reference to `mce_is_memory_error' ...: undefined reference to `mce_usable_address' The reason is in such configuration, despite there's no caller of tdx_dump_mce_info() it is still built and there's no implementation for the two "mce_*()" functions. Make INTEL_TDX_HOST depend on X86_MCE to fix. It makes sense to enable MCE support for the TDX host anyway. Because the only way that TDX has to report integrity errors is an MCE, and it is not good to silently ignore such MCE. The TDX spec also suggests the host VMM is expected to implement the MCE handler. Note it also makes sense to make INTEL_TDX_HOST select X86_MCE but this generates "recursive dependency detected!" error in the Kconfig. Closes: https://lore.kernel.org/all/20231212214612.GHZXjUpBFa1IwVMTI7@fat_crate.local/T/ Fixes: 70060463cb2b ("x86/mce: Differentiate real hardware #MCs from TDX erratum ones") Reported-by: Arnd Bergmann Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Link: https://lore.kernel.org/all/20231212214612.GHZXjUpBFa1IwVMTI7@fat_crate.local/T/#m1a109c29324b2bbd0b3b1d45c218012cd3a13be6 Link: https://lore.kernel.org/all/20231213222825.286809-1-kai.huang%40intel.com --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 01cdb16acff6e..92c03cb99b3e6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1974,6 +1974,7 @@ config INTEL_TDX_HOST select ARCH_KEEP_MEMBLOCK depends on CONTIG_ALLOC depends on !KEXEC_CORE + depends on X86_MCE help Intel Trust Domain Extensions (TDX) protects guest VMs from malicious host and certain physical attacks. This option enables necessary TDX -- GitLab From c966d23a351a33f8a977fd7efbb6f467132f7383 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 11 Dec 2023 23:05:45 -0800 Subject: [PATCH 0349/1290] perf unwind-libdw: Handle JIT-generated DSOs properly Usually DSOs are mapped from the beginning of the file, so the base address of the DSO can be calculated by map->start - map->pgoff. However, JIT DSOs which are generated by `perf inject -j`, are mapped only the code segment. This makes unwind-libdw code confusing and rejects processing unwinds in the JIT DSOs. It should use the map start address as base for them to fix the confusion. Fixes: 1fe627da30331024 ("perf unwind: Take pgoff into account when reporting elf to libdwfl") Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Fangrui Song Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Milian Wolff Cc: Pablo Galindo Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231212070547.612536-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/unwind-libdw.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 8554db3fc0d7c..6013335a8daea 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -46,6 +46,7 @@ static int __report_module(struct addr_location *al, u64 ip, { Dwfl_Module *mod; struct dso *dso = NULL; + Dwarf_Addr base; /* * Some callers will use al->sym, so we can't just use the * cheaper thread__find_map() here. @@ -58,13 +59,25 @@ static int __report_module(struct addr_location *al, u64 ip, if (!dso) return 0; + /* + * The generated JIT DSO files only map the code segment without + * ELF headers. Since JIT codes used to be packed in a memory + * segment, calculating the base address using pgoff falls into + * a different code in another DSO. So just use the map->start + * directly to pick the correct one. + */ + if (!strncmp(dso->long_name, "/tmp/jitted-", 12)) + base = map__start(al->map); + else + base = map__start(al->map) - map__pgoff(al->map); + mod = dwfl_addrmodule(ui->dwfl, ip); if (mod) { Dwarf_Addr s; dwfl_module_info(mod, NULL, &s, NULL, NULL, NULL, NULL, NULL); - if (s != map__start(al->map) - map__pgoff(al->map)) - mod = 0; + if (s != base) + mod = NULL; } if (!mod) { @@ -72,14 +85,14 @@ static int __report_module(struct addr_location *al, u64 ip, __symbol__join_symfs(filename, sizeof(filename), dso->long_name); mod = dwfl_report_elf(ui->dwfl, dso->short_name, filename, -1, - map__start(al->map) - map__pgoff(al->map), false); + base, false); } if (!mod) { char filename[PATH_MAX]; if (dso__build_id_filename(dso, filename, sizeof(filename), false)) mod = dwfl_report_elf(ui->dwfl, dso->short_name, filename, -1, - map__start(al->map) - map__pgoff(al->map), false); + base, false); } if (mod) { -- GitLab From 4fb54994b2360ab5029ee3a959161f6fe6bbb349 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 11 Dec 2023 23:05:46 -0800 Subject: [PATCH 0350/1290] perf unwind-libunwind: Fix base address for .eh_frame The base address of a DSO mapping should start at the start of the file. Usually DSOs are mapped from the pgoff 0 so it doesn't matter when it uses the start of the map address. But generated DSOs for JIT codes doesn't start from the 0 so it should subtract the offset to calculate the .eh_frame table offsets correctly. Fixes: dc2cf4ca866f5715 ("perf unwind: Fix segbase for ld.lld linked objects") Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Fangrui Song Cc: Ingo Molnar Cc: Jiri Olsa Cc: Milian Wolff Cc: Pablo Galindo Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231212070547.612536-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/unwind-libunwind-local.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index c0641882fd2fd..5e5c3395a4998 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -327,7 +327,7 @@ static int read_unwind_spec_eh_frame(struct dso *dso, struct unwind_info *ui, maps__for_each_entry(thread__maps(ui->thread), map_node) { struct map *map = map_node->map; - u64 start = map__start(map); + u64 start = map__start(map) - map__pgoff(map); if (map__dso(map) == dso && start < base_addr) base_addr = start; -- GitLab From 5fa695e7da4975e8d21ce49f3718d6cf00ecb75e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 14 Dec 2023 06:46:11 -0800 Subject: [PATCH 0351/1290] perf top: Use evsel's cpus to replace user_requested_cpus perf top errors out on a hybrid machine $perf top Error: The cycles:P event is not supported. The perf top expects that the "cycles" is collected on all CPUs in the system. But for hybrid there is no single "cycles" event which can cover all CPUs. Perf has to split it into two cycles events, e.g., cpu_core/cycles/ and cpu_atom/cycles/. Each event has its own CPU mask. If a event is opened on the unsupported CPU. The open fails. That's the reason of the above error out. Perf should only open the cycles event on the corresponding CPU. The commit ef91871c960e ("perf evlist: Propagate user CPU maps intersecting core PMU maps") intersect the requested CPU map with the CPU map of the PMU. Use the evsel's cpus to replace user_requested_cpus. The evlist's threads are also propagated to the evsel's threads in __perf_evlist__propagate_maps(). For a system-wide event, perf appends a dummy event and assign it to the evsel's threads. For a per-thread event, the evlist's thread_map is assigned to the evsel's threads. The same as the other tools, e.g., perf record, using the evsel's threads when opening an event. Reported-by: Arnaldo Carvalho de Melo Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Tested-by: Arnaldo Carvalho de Melo Cc: Hector Martin Cc: Marc Zyngier Cc: Mark Rutland Cc: Namhyung Kim Closes: https://lore.kernel.org/linux-perf-users/ZXNnDrGKXbEELMXV@kernel.org/ Link: https://lore.kernel.org/r/20231214144612.1092028-1-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index ed83afeeced05..13e609c0c6930 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1026,8 +1026,8 @@ static int perf_top__start_counters(struct perf_top *top) evlist__for_each_entry(evlist, counter) { try_again: - if (evsel__open(counter, top->evlist->core.user_requested_cpus, - top->evlist->core.threads) < 0) { + if (evsel__open(counter, counter->core.cpus, + counter->core.threads) < 0) { /* * Specially handle overwrite fall back. -- GitLab From a61f89bf76ef6f87ec48dd90dbc73a6cf9952edc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 14 Dec 2023 06:46:12 -0800 Subject: [PATCH 0352/1290] perf top: Uniform the event name for the hybrid machine It's hard to distinguish the default cycles events among hybrid PMUs. For example, $ perf top Available samples 385 cycles:P 903 cycles:P The other tool, e.g., perf record, uniforms the event name and adds the hybrid PMU name before opening the event. So the events can be easily distinguished. Apply the same methodology for the perf top as well. The evlist__uniquify_name() will be invoked by both record and top. Move it to util/evlist.c With the patch: $ perf top Available samples 148 cpu_atom/cycles:P/ 1K cpu_core/cycles:P/ Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Tested-by: Arnaldo Carvalho de Melo Cc: Hector Martin Cc: Marc Zyngier Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/r/20231214144612.1092028-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 28 +--------------------------- tools/perf/builtin-top.c | 1 + tools/perf/util/evlist.c | 25 +++++++++++++++++++++++++ tools/perf/util/evlist.h | 1 + 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 8a1002683fda2..a89013c44fd53 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -2237,32 +2237,6 @@ static void hit_auxtrace_snapshot_trigger(struct record *rec) } } -static void record__uniquify_name(struct record *rec) -{ - struct evsel *pos; - struct evlist *evlist = rec->evlist; - char *new_name; - int ret; - - if (perf_pmus__num_core_pmus() == 1) - return; - - evlist__for_each_entry(evlist, pos) { - if (!evsel__is_hybrid(pos)) - continue; - - if (strchr(pos->name, '/')) - continue; - - ret = asprintf(&new_name, "%s/%s/", - pos->pmu_name, pos->name); - if (ret) { - free(pos->name); - pos->name = new_name; - } - } -} - static int record__terminate_thread(struct record_thread *thread_data) { int err; @@ -2496,7 +2470,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) if (data->is_pipe && rec->evlist->core.nr_entries == 1) rec->opts.sample_id = true; - record__uniquify_name(rec); + evlist__uniquify_name(rec->evlist); /* Debug message used by test scripts */ pr_debug3("perf record opening and mmapping events\n"); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 13e609c0c6930..baf1ab083436e 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1298,6 +1298,7 @@ static int __cmd_top(struct perf_top *top) } } + evlist__uniquify_name(top->evlist); ret = perf_top__start_counters(top); if (ret) return ret; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 0ed3ce2aa8ebf..6f0892803c224 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -2518,3 +2518,28 @@ void evlist__warn_user_requested_cpus(struct evlist *evlist, const char *cpu_lis } perf_cpu_map__put(user_requested_cpus); } + +void evlist__uniquify_name(struct evlist *evlist) +{ + struct evsel *pos; + char *new_name; + int ret; + + if (perf_pmus__num_core_pmus() == 1) + return; + + evlist__for_each_entry(evlist, pos) { + if (!evsel__is_hybrid(pos)) + continue; + + if (strchr(pos->name, '/')) + continue; + + ret = asprintf(&new_name, "%s/%s/", + pos->pmu_name, pos->name); + if (ret) { + free(pos->name); + pos->name = new_name; + } + } +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 98e7ddb2bd305..cb91dc9117a27 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -442,5 +442,6 @@ struct evsel *evlist__find_evsel(struct evlist *evlist, int idx); int evlist__scnprintf_evsels(struct evlist *evlist, size_t size, char *bf); void evlist__check_mem_load_aux(struct evlist *evlist); void evlist__warn_user_requested_cpus(struct evlist *evlist, const char *cpu_list); +void evlist__uniquify_name(struct evlist *evlist); #endif /* __PERF_EVLIST_H */ -- GitLab From 9594f273fafe76663e502337eaf393dc4f7bf8fa Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 13 Dec 2023 21:37:15 -0800 Subject: [PATCH 0353/1290] Input: da9063 - simplify obtaining OF match data Simplify probe() by replacing of_match_node() for retrieving match data by device_get_match_data(). Some minor cleanups: * Remove the trailing comma in the terminator entry for the OF table making code robust against (theoretical) misrebases or other similar things where the new entry goes _after_ the termination without the compiler noticing. * Move OF table near to the user. * Arrange variables in reverse xmas tree order in probe(). Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20231213214803.9931-2-biju.das.jz@bp.renesas.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/da9063_onkey.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/input/misc/da9063_onkey.c b/drivers/input/misc/da9063_onkey.c index 74808bae326a7..9351ce0bb4057 100644 --- a/drivers/input/misc/da9063_onkey.c +++ b/drivers/input/misc/da9063_onkey.c @@ -74,13 +74,6 @@ static const struct da906x_chip_config da9062_regs = { .name = "da9062-onkey", }; -static const struct of_device_id da9063_compatible_reg_id_table[] = { - { .compatible = "dlg,da9063-onkey", .data = &da9063_regs }, - { .compatible = "dlg,da9062-onkey", .data = &da9062_regs }, - { }, -}; -MODULE_DEVICE_TABLE(of, da9063_compatible_reg_id_table); - static void da9063_poll_on(struct work_struct *work) { struct da9063_onkey *onkey = container_of(work, @@ -187,14 +180,8 @@ static irqreturn_t da9063_onkey_irq_handler(int irq, void *data) static int da9063_onkey_probe(struct platform_device *pdev) { struct da9063_onkey *onkey; - const struct of_device_id *match; - int irq; int error; - - match = of_match_node(da9063_compatible_reg_id_table, - pdev->dev.of_node); - if (!match) - return -ENXIO; + int irq; onkey = devm_kzalloc(&pdev->dev, sizeof(struct da9063_onkey), GFP_KERNEL); @@ -203,7 +190,10 @@ static int da9063_onkey_probe(struct platform_device *pdev) return -ENOMEM; } - onkey->config = match->data; + onkey->config = device_get_match_data(&pdev->dev); + if (!onkey->config) + return -ENXIO; + onkey->dev = &pdev->dev; onkey->regmap = dev_get_regmap(pdev->dev.parent, NULL); @@ -270,6 +260,13 @@ static int da9063_onkey_probe(struct platform_device *pdev) return 0; } +static const struct of_device_id da9063_compatible_reg_id_table[] = { + { .compatible = "dlg,da9063-onkey", .data = &da9063_regs }, + { .compatible = "dlg,da9062-onkey", .data = &da9062_regs }, + { } +}; +MODULE_DEVICE_TABLE(of, da9063_compatible_reg_id_table); + static struct platform_driver da9063_onkey_driver = { .probe = da9063_onkey_probe, .driver = { -- GitLab From bd2334eda183a2888b5541605b39617c1e6b4b71 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 13 Dec 2023 23:48:38 -0800 Subject: [PATCH 0354/1290] Input: da9063 - drop redundant prints in probe() The memory allocation core code already prints error message in case of OOM. So, drop additional print messages for OOM cases. While at it, input_register_device() is already printing error messages on failure. Drop the redundant print. Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20231213214803.9931-3-biju.das.jz@bp.renesas.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/da9063_onkey.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/input/misc/da9063_onkey.c b/drivers/input/misc/da9063_onkey.c index 9351ce0bb4057..5483a0576155d 100644 --- a/drivers/input/misc/da9063_onkey.c +++ b/drivers/input/misc/da9063_onkey.c @@ -185,10 +185,8 @@ static int da9063_onkey_probe(struct platform_device *pdev) onkey = devm_kzalloc(&pdev->dev, sizeof(struct da9063_onkey), GFP_KERNEL); - if (!onkey) { - dev_err(&pdev->dev, "Failed to allocate memory.\n"); + if (!onkey) return -ENOMEM; - } onkey->config = device_get_match_data(&pdev->dev); if (!onkey->config) @@ -206,10 +204,8 @@ static int da9063_onkey_probe(struct platform_device *pdev) "dlg,disable-key-power"); onkey->input = devm_input_allocate_device(&pdev->dev); - if (!onkey->input) { - dev_err(&pdev->dev, "Failed to allocated input device.\n"); + if (!onkey->input) return -ENOMEM; - } onkey->input->name = onkey->config->name; snprintf(onkey->phys, sizeof(onkey->phys), "%s/input0", @@ -221,12 +217,8 @@ static int da9063_onkey_probe(struct platform_device *pdev) error = devm_delayed_work_autocancel(&pdev->dev, &onkey->work, da9063_poll_on); - if (error) { - dev_err(&pdev->dev, - "Failed to add cancel poll action: %d\n", - error); + if (error) return error; - } irq = platform_get_irq_byname(pdev, "ONKEY"); if (irq < 0) @@ -251,11 +243,8 @@ static int da9063_onkey_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, true); error = input_register_device(onkey->input); - if (error) { - dev_err(&pdev->dev, - "Failed to register input device: %d\n", error); + if (error) return error; - } return 0; } -- GitLab From c67f8a13be4e9ffc862c7f60ec97a799a1547486 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Thu, 14 Dec 2023 22:05:20 -0800 Subject: [PATCH 0355/1290] Input: da9063 - use dev_err_probe() Replace dev_err()->dev_err_probe() to simplify probe(). Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20231213214803.9931-4-biju.das.jz@bp.renesas.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/da9063_onkey.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/input/misc/da9063_onkey.c b/drivers/input/misc/da9063_onkey.c index 5483a0576155d..a8b7f1cd0ec2a 100644 --- a/drivers/input/misc/da9063_onkey.c +++ b/drivers/input/misc/da9063_onkey.c @@ -195,10 +195,9 @@ static int da9063_onkey_probe(struct platform_device *pdev) onkey->dev = &pdev->dev; onkey->regmap = dev_get_regmap(pdev->dev.parent, NULL); - if (!onkey->regmap) { - dev_err(&pdev->dev, "Parent regmap unavailable.\n"); - return -ENXIO; - } + if (!onkey->regmap) + return dev_err_probe(&pdev->dev, -ENXIO, + "Parent regmap unavailable.\n"); onkey->key_power = !of_property_read_bool(pdev->dev.of_node, "dlg,disable-key-power"); @@ -228,11 +227,9 @@ static int da9063_onkey_probe(struct platform_device *pdev) NULL, da9063_onkey_irq_handler, IRQF_TRIGGER_LOW | IRQF_ONESHOT, "ONKEY", onkey); - if (error) { - dev_err(&pdev->dev, - "Failed to request IRQ %d: %d\n", irq, error); - return error; - } + if (error) + return dev_err_probe(&pdev->dev, error, + "Failed to allocate onkey IRQ\n"); error = dev_pm_set_wake_irq(&pdev->dev, irq); if (error) -- GitLab From a63c357b9fd56ad5fe64616f5b22835252c6a76a Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Fri, 8 Dec 2023 15:41:40 -0800 Subject: [PATCH 0356/1290] iommu/dma: Trace bounce buffer usage when mapping buffers When commit 82612d66d51d ("iommu: Allow the dma-iommu api to use bounce buffers") was introduced, it did not add the logic for tracing the bounce buffer usage from iommu_dma_map_page(). All of the users of swiotlb_tbl_map_single() trace their bounce buffer usage, except iommu_dma_map_page(). This makes it difficult to track SWIOTLB usage from that function. Thus, trace bounce buffer usage from iommu_dma_map_page(). Fixes: 82612d66d51d ("iommu: Allow the dma-iommu api to use bounce buffers") Cc: stable@vger.kernel.org # v5.15+ Cc: Tom Murphy Cc: Lu Baolu Cc: Saravana Kannan Signed-off-by: Isaac J. Manjarres Link: https://lore.kernel.org/r/20231208234141.2356157-1-isaacmanjarres@google.com Signed-off-by: Joerg Roedel --- drivers/iommu/dma-iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 85163a83df2f6..037fcf826407f 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "dma-iommu.h" @@ -1156,6 +1157,8 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, return DMA_MAPPING_ERROR; } + trace_swiotlb_bounced(dev, phys, size); + aligned_size = iova_align(iovad, size); phys = swiotlb_tbl_map_single(dev, phys, size, aligned_size, iova_mask(iovad), dir, attrs); -- GitLab From f8aa519976b38e67aae02d2db3e2998513305e80 Mon Sep 17 00:00:00 2001 From: Andy Yan Date: Tue, 12 Dec 2023 08:57:10 +0800 Subject: [PATCH 0357/1290] dt-bindings: iommu: rockchip: Add Rockchip RK3588 Add a Rockchip RK3588 compatible I split it from the vop2 patch series as suggested by Heiko[0] Signed-off-by: Andy Yan Reviewed-by: Heiko Stuebner Acked-by: Krzysztof Kozlowski [0]https://patchwork.kernel.org/project/dri-devel/patch/20231207080235.652719-1-andyshrk@163.com/ Link: https://lore.kernel.org/r/20231212005710.1837066-1-andyshrk@163.com Signed-off-by: Joerg Roedel --- .../devicetree/bindings/iommu/rockchip,iommu.yaml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/iommu/rockchip,iommu.yaml b/Documentation/devicetree/bindings/iommu/rockchip,iommu.yaml index ba9124f721f15..621dde0e45d85 100644 --- a/Documentation/devicetree/bindings/iommu/rockchip,iommu.yaml +++ b/Documentation/devicetree/bindings/iommu/rockchip,iommu.yaml @@ -19,9 +19,14 @@ description: |+ properties: compatible: - enum: - - rockchip,iommu - - rockchip,rk3568-iommu + oneOf: + - enum: + - rockchip,iommu + - rockchip,rk3568-iommu + - items: + - enum: + - rockchip,rk3588-iommu + - const: rockchip,rk3568-iommu reg: items: -- GitLab From 9991a82a38174672273df44a3cc6c87ef74b18e9 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Wed, 13 Dec 2023 03:14:50 -0800 Subject: [PATCH 0358/1290] iommu/sva: Fix memory leak in iommu_sva_bind_device() Free the handle when the domain allocation fails before unlocking and returning. Fixes: 092edaddb660 ("iommu: Support mm PASID 1:n with sva domains") Signed-off-by: Harshit Mogalapalli Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20231213111450.2487861-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu-sva.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 5175e8d85247b..c3fc9201d0be9 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -101,7 +101,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm domain = iommu_sva_domain_alloc(dev, mm); if (!domain) { ret = -ENOMEM; - goto out_unlock; + goto out_free_handle; } ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid); @@ -118,6 +118,7 @@ out: out_free_domain: iommu_domain_free(domain); +out_free_handle: kfree(handle); out_unlock: mutex_unlock(&iommu_sva_lock); -- GitLab From 9c556b7c3f520d42c435c0d78b25c719c060f8a1 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Thu, 14 Dec 2023 10:47:02 +0530 Subject: [PATCH 0359/1290] trace/kprobe: Display the actual notrace function when rejecting a probe Trying to probe update_sd_lb_stats() using perf results in the below message in the kernel log: trace_kprobe: Could not probe notrace function _text This is because 'perf probe' specifies the kprobe location as an offset from '_text': $ sudo perf probe -D update_sd_lb_stats p:probe/update_sd_lb_stats _text+1830728 However, the error message is misleading and doesn't help convey the actual notrace function that is being probed. Fix this by looking up the actual function name that is being probed. With this fix, we now get the below message in the kernel log: trace_kprobe: Could not probe notrace function update_sd_lb_stats.constprop.0 Link: https://lore.kernel.org/all/20231214051702.1687300-1-naveen@kernel.org/ Signed-off-by: Naveen N Rao Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_kprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 52f8b537dd0a0..c4c6e0e0068be 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -487,8 +487,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) return -EINVAL; if (within_notrace_func(tk)) { - pr_warn("Could not probe notrace function %s\n", - trace_kprobe_symbol(tk)); + pr_warn("Could not probe notrace function %ps\n", + (void *)trace_kprobe_address(tk)); return -EINVAL; } -- GitLab From af838635a3eb9b1bc0d98599c101ebca98f31311 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 27 Nov 2023 23:36:50 -0600 Subject: [PATCH 0360/1290] rtc: mc146818-lib: Adjust failure return code for mc146818_get_time() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mc146818_get_time() calls mc146818_avoid_UIP() to avoid fetching the time while RTC update is in progress (UIP). When this fails, the return code is -EIO, but actually there was no IO failure. The reason for the return from mc146818_avoid_UIP() is that the UIP wasn't cleared in the time period. Adjust the return code to -ETIMEDOUT to match the behavior. Tested-by: Mateusz Jończyk Reviewed-by: Mateusz Jończyk Acked-by: Mateusz Jończyk Cc: Fixes: 2a61b0ac5493 ("rtc: mc146818-lib: refactor mc146818_get_time") Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20231128053653.101798-2-mario.limonciello@amd.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-mc146818-lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index f1c09f1db044c..43a28e82674e8 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -138,7 +138,7 @@ int mc146818_get_time(struct rtc_time *time) if (!mc146818_avoid_UIP(mc146818_get_time_callback, &p)) { memset(time, 0, sizeof(*time)); - return -EIO; + return -ETIMEDOUT; } if (!(p.ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) -- GitLab From 1311a8f0d4b23f58bbababa13623aa40b8ad4e0c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 27 Nov 2023 23:36:51 -0600 Subject: [PATCH 0361/1290] rtc: Adjust failure return code for cmos_set_alarm() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When mc146818_avoid_UIP() fails to return a valid value, this is because UIP didn't clear in the timeout period. Adjust the return code in this case to -ETIMEDOUT. Tested-by: Mateusz Jończyk Reviewed-by: Mateusz Jończyk Acked-by: Mateusz Jończyk Cc: Fixes: cdedc45c579f ("rtc: cmos: avoid UIP when reading alarm time") Fixes: cd17420ebea5 ("rtc: cmos: avoid UIP when writing alarm time") Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20231128053653.101798-3-mario.limonciello@amd.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-cmos.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 696cfa7025ded..2a1a83caf081a 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -292,7 +292,7 @@ static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t) /* This not only a rtc_op, but also called directly */ if (!is_valid_irq(cmos->irq)) - return -EIO; + return -ETIMEDOUT; /* Basic alarms only support hour, minute, and seconds fields. * Some also support day and month, for alarms up to a year in @@ -557,7 +557,7 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t) * Use mc146818_avoid_UIP() to avoid this. */ if (!mc146818_avoid_UIP(cmos_set_alarm_callback, &p)) - return -EIO; + return -ETIMEDOUT; cmos->alarm_expires = rtc_tm_to_time64(&t->time); -- GitLab From 120931db07b49252aba2073096b595482d71857c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 27 Nov 2023 23:36:52 -0600 Subject: [PATCH 0362/1290] rtc: Add support for configuring the UIP timeout for RTC reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The UIP timeout is hardcoded to 10ms for all RTC reads, but in some contexts this might not be enough time. Add a timeout parameter to mc146818_get_time() and mc146818_get_time_callback(). If UIP timeout is configured by caller to be >=100 ms and a call takes this long, log a warning. Make all callers use 10ms to ensure no functional changes. Cc: # 6.1.y Fixes: ec5895c0f2d8 ("rtc: mc146818-lib: extract mc146818_avoid_UIP") Signed-off-by: Mario Limonciello Tested-by: Mateusz Jończyk Reviewed-by: Mateusz Jończyk Acked-by: Mateusz Jończyk Link: https://lore.kernel.org/r/20231128053653.101798-4-mario.limonciello@amd.com Signed-off-by: Alexandre Belloni --- arch/alpha/kernel/rtc.c | 2 +- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/rtc.c | 2 +- drivers/base/power/trace.c | 2 +- drivers/rtc/rtc-cmos.c | 6 +++--- drivers/rtc/rtc-mc146818-lib.c | 37 ++++++++++++++++++++++++++-------- include/linux/mc146818rtc.h | 3 ++- 7 files changed, 38 insertions(+), 16 deletions(-) diff --git a/arch/alpha/kernel/rtc.c b/arch/alpha/kernel/rtc.c index fb3025396ac96..cfdf90bc8b3f8 100644 --- a/arch/alpha/kernel/rtc.c +++ b/arch/alpha/kernel/rtc.c @@ -80,7 +80,7 @@ init_rtc_epoch(void) static int alpha_rtc_read_time(struct device *dev, struct rtc_time *tm) { - int ret = mc146818_get_time(tm); + int ret = mc146818_get_time(tm, 10); if (ret < 0) { dev_err_ratelimited(dev, "unable to read current time\n"); diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 41eecf180b7f4..17adad4cbe78c 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1438,7 +1438,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) { - if (unlikely(mc146818_get_time(&curr_time) < 0)) { + if (unlikely(mc146818_get_time(&curr_time, 10) < 0)) { pr_err_ratelimited("unable to read current time from RTC\n"); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 1309b9b053386..961ebc7f18723 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -67,7 +67,7 @@ void mach_get_cmos_time(struct timespec64 *now) return; } - if (mc146818_get_time(&tm)) { + if (mc146818_get_time(&tm, 10)) { pr_err("Unable to read current time from RTC\n"); now->tv_sec = now->tv_nsec = 0; return; diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index 72b7a92337b18..c2e9253574748 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -120,7 +120,7 @@ static unsigned int read_magic_time(void) struct rtc_time time; unsigned int val; - if (mc146818_get_time(&time) < 0) { + if (mc146818_get_time(&time, 10) < 0) { pr_err("Unable to read current time from RTC\n"); return 0; } diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 2a1a83caf081a..468ccd99f218c 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -231,7 +231,7 @@ static int cmos_read_time(struct device *dev, struct rtc_time *t) if (!pm_trace_rtc_valid()) return -EIO; - ret = mc146818_get_time(t); + ret = mc146818_get_time(t, 10); if (ret < 0) { dev_err_ratelimited(dev, "unable to read current time\n"); return ret; @@ -307,7 +307,7 @@ static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t) * * Use the mc146818_avoid_UIP() function to avoid this. */ - if (!mc146818_avoid_UIP(cmos_read_alarm_callback, &p)) + if (!mc146818_avoid_UIP(cmos_read_alarm_callback, 10, &p)) return -EIO; if (!(p.rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { @@ -556,7 +556,7 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t) * * Use mc146818_avoid_UIP() to avoid this. */ - if (!mc146818_avoid_UIP(cmos_set_alarm_callback, &p)) + if (!mc146818_avoid_UIP(cmos_set_alarm_callback, 10, &p)) return -ETIMEDOUT; cmos->alarm_expires = rtc_tm_to_time64(&t->time); diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index 43a28e82674e8..4c17b3cef11ea 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -8,26 +8,31 @@ #include #endif +#define UIP_RECHECK_DELAY 100 /* usec */ +#define UIP_RECHECK_DELAY_MS (USEC_PER_MSEC / UIP_RECHECK_DELAY) +#define UIP_RECHECK_LOOPS_MS(x) (x / UIP_RECHECK_DELAY_MS) + /* * Execute a function while the UIP (Update-in-progress) bit of the RTC is - * unset. + * unset. The timeout is configurable by the caller in ms. * * Warning: callback may be executed more then once. */ bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param), + int timeout, void *param) { int i; unsigned long flags; unsigned char seconds; - for (i = 0; i < 100; i++) { + for (i = 0; UIP_RECHECK_LOOPS_MS(i) < timeout; i++) { spin_lock_irqsave(&rtc_lock, flags); /* * Check whether there is an update in progress during which the * readout is unspecified. The maximum update time is ~2ms. Poll - * every 100 usec for completion. + * for completion. * * Store the second value before checking UIP so a long lasting * NMI which happens to hit after the UIP check cannot make @@ -37,7 +42,7 @@ bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param), if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) { spin_unlock_irqrestore(&rtc_lock, flags); - udelay(100); + udelay(UIP_RECHECK_DELAY); continue; } @@ -56,7 +61,7 @@ bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param), */ if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) { spin_unlock_irqrestore(&rtc_lock, flags); - udelay(100); + udelay(UIP_RECHECK_DELAY); continue; } @@ -72,6 +77,10 @@ bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param), } spin_unlock_irqrestore(&rtc_lock, flags); + if (UIP_RECHECK_LOOPS_MS(i) >= 100) + pr_warn("Reading current time from RTC took around %li ms\n", + UIP_RECHECK_LOOPS_MS(i)); + return true; } return false; @@ -84,7 +93,7 @@ EXPORT_SYMBOL_GPL(mc146818_avoid_UIP); */ bool mc146818_does_rtc_work(void) { - return mc146818_avoid_UIP(NULL, NULL); + return mc146818_avoid_UIP(NULL, 10, NULL); } EXPORT_SYMBOL_GPL(mc146818_does_rtc_work); @@ -130,13 +139,25 @@ static void mc146818_get_time_callback(unsigned char seconds, void *param_in) p->ctrl = CMOS_READ(RTC_CONTROL); } -int mc146818_get_time(struct rtc_time *time) +/** + * mc146818_get_time - Get the current time from the RTC + * @time: pointer to struct rtc_time to store the current time + * @timeout: timeout value in ms + * + * This function reads the current time from the RTC and stores it in the + * provided struct rtc_time. The timeout parameter specifies the maximum + * time to wait for the RTC to become ready. + * + * Return: 0 on success, -ETIMEDOUT if the RTC did not become ready within + * the specified timeout, or another error code if an error occurred. + */ +int mc146818_get_time(struct rtc_time *time, int timeout) { struct mc146818_get_time_callback_param p = { .time = time }; - if (!mc146818_avoid_UIP(mc146818_get_time_callback, &p)) { + if (!mc146818_avoid_UIP(mc146818_get_time_callback, timeout, &p)) { memset(time, 0, sizeof(*time)); return -ETIMEDOUT; } diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h index b0da04fe087bb..34dfcc77f505a 100644 --- a/include/linux/mc146818rtc.h +++ b/include/linux/mc146818rtc.h @@ -126,10 +126,11 @@ struct cmos_rtc_board_info { #endif /* ARCH_RTC_LOCATION */ bool mc146818_does_rtc_work(void); -int mc146818_get_time(struct rtc_time *time); +int mc146818_get_time(struct rtc_time *time, int timeout); int mc146818_set_time(struct rtc_time *time); bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param), + int timeout, void *param); #endif /* _MC146818RTC_H */ -- GitLab From cef9ecc8e938dd48a560f7dd9be1246359248d20 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 27 Nov 2023 23:36:53 -0600 Subject: [PATCH 0363/1290] rtc: Extend timeout for waiting for UIP to clear to 1s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Specs don't say anything about UIP being cleared within 10ms. They only say that UIP won't occur for another 244uS. If a long NMI occurs while UIP is still updating it might not be possible to get valid data in 10ms. This has been observed in the wild that around s2idle some calls can take up to 480ms before UIP is clear. Adjust callers from outside an interrupt context to wait for up to a 1s instead of 10ms. Cc: # 6.1.y Fixes: ec5895c0f2d8 ("rtc: mc146818-lib: extract mc146818_avoid_UIP") Reported-by: Carsten Hatger Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217626 Tested-by: Mateusz Jończyk Reviewed-by: Mateusz Jończyk Acked-by: Mateusz Jończyk Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20231128053653.101798-5-mario.limonciello@amd.com Signed-off-by: Alexandre Belloni --- arch/x86/kernel/rtc.c | 2 +- drivers/base/power/trace.c | 2 +- drivers/rtc/rtc-cmos.c | 2 +- drivers/rtc/rtc-mc146818-lib.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 961ebc7f18723..2e7066980f3e8 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -67,7 +67,7 @@ void mach_get_cmos_time(struct timespec64 *now) return; } - if (mc146818_get_time(&tm, 10)) { + if (mc146818_get_time(&tm, 1000)) { pr_err("Unable to read current time from RTC\n"); now->tv_sec = now->tv_nsec = 0; return; diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index c2e9253574748..cd6e559648b21 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -120,7 +120,7 @@ static unsigned int read_magic_time(void) struct rtc_time time; unsigned int val; - if (mc146818_get_time(&time, 10) < 0) { + if (mc146818_get_time(&time, 1000) < 0) { pr_err("Unable to read current time from RTC\n"); return 0; } diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 468ccd99f218c..7d99cd2c37a0b 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -231,7 +231,7 @@ static int cmos_read_time(struct device *dev, struct rtc_time *t) if (!pm_trace_rtc_valid()) return -EIO; - ret = mc146818_get_time(t, 10); + ret = mc146818_get_time(t, 1000); if (ret < 0) { dev_err_ratelimited(dev, "unable to read current time\n"); return ret; diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index 4c17b3cef11ea..651bf3c279c74 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -93,7 +93,7 @@ EXPORT_SYMBOL_GPL(mc146818_avoid_UIP); */ bool mc146818_does_rtc_work(void) { - return mc146818_avoid_UIP(NULL, 10, NULL); + return mc146818_avoid_UIP(NULL, 1000, NULL); } EXPORT_SYMBOL_GPL(mc146818_does_rtc_work); -- GitLab From 3767bba698707d7bad5099371b797d0fd6d39709 Mon Sep 17 00:00:00 2001 From: Jacky Huang Date: Mon, 25 Sep 2023 07:02:49 +0000 Subject: [PATCH 0364/1290] dt-bindings: rtc: Add Nuvoton ma35d1 rtc Add documentation describing the Nuvoton ma35d1 rtc controller. Signed-off-by: Jacky Huang Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20230925070251.28-2-ychuang570808@gmail.com Signed-off-by: Alexandre Belloni --- .../bindings/rtc/nuvoton,ma35d1-rtc.yaml | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 Documentation/devicetree/bindings/rtc/nuvoton,ma35d1-rtc.yaml diff --git a/Documentation/devicetree/bindings/rtc/nuvoton,ma35d1-rtc.yaml b/Documentation/devicetree/bindings/rtc/nuvoton,ma35d1-rtc.yaml new file mode 100644 index 0000000000000..5e4ade803eed1 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/nuvoton,ma35d1-rtc.yaml @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/nuvoton,ma35d1-rtc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Nuvoton MA35D1 Real Time Clock + +maintainers: + - Min-Jen Chen + +allOf: + - $ref: rtc.yaml# + +properties: + compatible: + enum: + - nuvoton,ma35d1-rtc + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + - clocks + +unevaluatedProperties: false + +examples: + - | + #include + #include + rtc@40410000 { + compatible = "nuvoton,ma35d1-rtc"; + reg = <0x40410000 0x200>; + interrupts = ; + clocks = <&clk RTC_GATE>; + }; + +... -- GitLab From dc0684adf3b6be6b20fec9295027980021d30353 Mon Sep 17 00:00:00 2001 From: Jacky Huang Date: Mon, 25 Sep 2023 07:02:51 +0000 Subject: [PATCH 0365/1290] rtc: Add driver for Nuvoton ma35d1 rtc controller The ma35d1 rtc controller provides real-time and calendar messaging capabilities. It supports programmable time tick and alarm match interrupts. The time and calendar messages are expressed in BCD format. This driver supports the built-in rtc controller of the ma35d1. It enables setting and reading the rtc time and configuring and reading the rtc alarm. Signed-off-by: Jacky Huang Link: https://lore.kernel.org/r/20230925070251.28-4-ychuang570808@gmail.com Signed-off-by: Alexandre Belloni --- drivers/rtc/Kconfig | 11 ++ drivers/rtc/Makefile | 1 + drivers/rtc/rtc-ma35d1.c | 324 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 336 insertions(+) create mode 100644 drivers/rtc/rtc-ma35d1.c diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 3814e0845e772..4e3c2fde2d6a6 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1930,6 +1930,17 @@ config RTC_DRV_TI_K3 This driver can also be built as a module, if so, the module will be called "rtc-ti-k3". +config RTC_DRV_MA35D1 + tristate "Nuvoton MA35D1 RTC" + depends on ARCH_MA35 || COMPILE_TEST + select REGMAP_MMIO + help + If you say yes here you get support for the Nuvoton MA35D1 + On-Chip Real Time Clock. + + This driver can also be built as a module, if so, the module + will be called "rtc-ma35d1". + comment "HID Sensor RTC drivers" config RTC_DRV_HID_SENSOR_TIME diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 7b03c3abfd786..6817ea4e44687 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -88,6 +88,7 @@ obj-$(CONFIG_RTC_DRV_M41T94) += rtc-m41t94.o obj-$(CONFIG_RTC_DRV_M48T35) += rtc-m48t35.o obj-$(CONFIG_RTC_DRV_M48T59) += rtc-m48t59.o obj-$(CONFIG_RTC_DRV_M48T86) += rtc-m48t86.o +obj-$(CONFIG_RTC_DRV_MA35D1) += rtc-ma35d1.o obj-$(CONFIG_RTC_DRV_MAX6900) += rtc-max6900.o obj-$(CONFIG_RTC_DRV_MAX6902) += rtc-max6902.o obj-$(CONFIG_RTC_DRV_MAX6916) += rtc-max6916.o diff --git a/drivers/rtc/rtc-ma35d1.c b/drivers/rtc/rtc-ma35d1.c new file mode 100644 index 0000000000000..07c9a083a9d5d --- /dev/null +++ b/drivers/rtc/rtc-ma35d1.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RTC driver for Nuvoton MA35D1 + * + * Copyright (C) 2023 Nuvoton Technology Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* MA35D1 RTC Control Registers */ +#define MA35_REG_RTC_INIT 0x00 +#define MA35_REG_RTC_SINFASTS 0x04 +#define MA35_REG_RTC_FREQADJ 0x08 +#define MA35_REG_RTC_TIME 0x0c +#define MA35_REG_RTC_CAL 0x10 +#define MA35_REG_RTC_CLKFMT 0x14 +#define MA35_REG_RTC_WEEKDAY 0x18 +#define MA35_REG_RTC_TALM 0x1c +#define MA35_REG_RTC_CALM 0x20 +#define MA35_REG_RTC_LEAPYEAR 0x24 +#define MA35_REG_RTC_INTEN 0x28 +#define MA35_REG_RTC_INTSTS 0x2c + +/* register MA35_REG_RTC_INIT */ +#define RTC_INIT_ACTIVE BIT(0) +#define RTC_INIT_MAGIC_CODE 0xa5eb1357 + +/* register MA35_REG_RTC_CLKFMT */ +#define RTC_CLKFMT_24HEN BIT(0) +#define RTC_CLKFMT_DCOMPEN BIT(16) + +/* register MA35_REG_RTC_INTEN */ +#define RTC_INTEN_ALMIEN BIT(0) +#define RTC_INTEN_UIEN BIT(1) +#define RTC_INTEN_CLKFIEN BIT(24) +#define RTC_INTEN_CLKSTIEN BIT(25) + +/* register MA35_REG_RTC_INTSTS */ +#define RTC_INTSTS_ALMIF BIT(0) +#define RTC_INTSTS_UIF BIT(1) +#define RTC_INTSTS_CLKFIF BIT(24) +#define RTC_INTSTS_CLKSTIF BIT(25) + +#define RTC_INIT_TIMEOUT 250 + +struct ma35_rtc { + int irq_num; + void __iomem *rtc_reg; + struct rtc_device *rtcdev; +}; + +static u32 rtc_reg_read(struct ma35_rtc *p, u32 offset) +{ + return __raw_readl(p->rtc_reg + offset); +} + +static inline void rtc_reg_write(struct ma35_rtc *p, u32 offset, u32 value) +{ + __raw_writel(value, p->rtc_reg + offset); +} + +static irqreturn_t ma35d1_rtc_interrupt(int irq, void *data) +{ + struct ma35_rtc *rtc = (struct ma35_rtc *)data; + unsigned long events = 0, rtc_irq; + + rtc_irq = rtc_reg_read(rtc, MA35_REG_RTC_INTSTS); + + if (rtc_irq & RTC_INTSTS_ALMIF) { + rtc_reg_write(rtc, MA35_REG_RTC_INTSTS, RTC_INTSTS_ALMIF); + events |= RTC_AF | RTC_IRQF; + } + + if (rtc_irq & RTC_INTSTS_UIF) { + rtc_reg_write(rtc, MA35_REG_RTC_INTSTS, RTC_INTSTS_UIF); + events |= RTC_UF | RTC_IRQF; + } + + rtc_update_irq(rtc->rtcdev, 1, events); + + return IRQ_HANDLED; +} + +static int ma35d1_rtc_init(struct ma35_rtc *rtc, u32 ms_timeout) +{ + const unsigned long timeout = jiffies + msecs_to_jiffies(ms_timeout); + + do { + if (rtc_reg_read(rtc, MA35_REG_RTC_INIT) & RTC_INIT_ACTIVE) + return 0; + + rtc_reg_write(rtc, MA35_REG_RTC_INIT, RTC_INIT_MAGIC_CODE); + + mdelay(1); + + } while (time_before(jiffies, timeout)); + + return -ETIMEDOUT; +} + +static int ma35d1_alarm_irq_enable(struct device *dev, u32 enabled) +{ + struct ma35_rtc *rtc = dev_get_drvdata(dev); + u32 reg_ien; + + reg_ien = rtc_reg_read(rtc, MA35_REG_RTC_INTEN); + + if (enabled) + rtc_reg_write(rtc, MA35_REG_RTC_INTEN, reg_ien | RTC_INTEN_ALMIEN); + else + rtc_reg_write(rtc, MA35_REG_RTC_INTEN, reg_ien & ~RTC_INTEN_ALMIEN); + + return 0; +} + +static int ma35d1_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct ma35_rtc *rtc = dev_get_drvdata(dev); + u32 time, cal, wday; + + do { + time = rtc_reg_read(rtc, MA35_REG_RTC_TIME); + cal = rtc_reg_read(rtc, MA35_REG_RTC_CAL); + wday = rtc_reg_read(rtc, MA35_REG_RTC_WEEKDAY); + } while (time != rtc_reg_read(rtc, MA35_REG_RTC_TIME) || + cal != rtc_reg_read(rtc, MA35_REG_RTC_CAL)); + + tm->tm_mday = bcd2bin(cal >> 0); + tm->tm_wday = wday; + tm->tm_mon = bcd2bin(cal >> 8); + tm->tm_mon = tm->tm_mon - 1; + tm->tm_year = bcd2bin(cal >> 16) + 100; + + tm->tm_sec = bcd2bin(time >> 0); + tm->tm_min = bcd2bin(time >> 8); + tm->tm_hour = bcd2bin(time >> 16); + + return rtc_valid_tm(tm); +} + +static int ma35d1_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct ma35_rtc *rtc = dev_get_drvdata(dev); + u32 val; + + val = bin2bcd(tm->tm_mday) << 0 | bin2bcd(tm->tm_mon + 1) << 8 | + bin2bcd(tm->tm_year - 100) << 16; + rtc_reg_write(rtc, MA35_REG_RTC_CAL, val); + + val = bin2bcd(tm->tm_sec) << 0 | bin2bcd(tm->tm_min) << 8 | + bin2bcd(tm->tm_hour) << 16; + rtc_reg_write(rtc, MA35_REG_RTC_TIME, val); + + val = tm->tm_wday; + rtc_reg_write(rtc, MA35_REG_RTC_WEEKDAY, val); + + return 0; +} + +static int ma35d1_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct ma35_rtc *rtc = dev_get_drvdata(dev); + u32 talm, calm; + + talm = rtc_reg_read(rtc, MA35_REG_RTC_TALM); + calm = rtc_reg_read(rtc, MA35_REG_RTC_CALM); + + alrm->time.tm_mday = bcd2bin(calm >> 0); + alrm->time.tm_mon = bcd2bin(calm >> 8); + alrm->time.tm_mon = alrm->time.tm_mon - 1; + + alrm->time.tm_year = bcd2bin(calm >> 16) + 100; + + alrm->time.tm_sec = bcd2bin(talm >> 0); + alrm->time.tm_min = bcd2bin(talm >> 8); + alrm->time.tm_hour = bcd2bin(talm >> 16); + + return rtc_valid_tm(&alrm->time); +} + +static int ma35d1_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct ma35_rtc *rtc = dev_get_drvdata(dev); + unsigned long val; + + val = bin2bcd(alrm->time.tm_mday) << 0 | bin2bcd(alrm->time.tm_mon + 1) << 8 | + bin2bcd(alrm->time.tm_year - 100) << 16; + rtc_reg_write(rtc, MA35_REG_RTC_CALM, val); + + val = bin2bcd(alrm->time.tm_sec) << 0 | bin2bcd(alrm->time.tm_min) << 8 | + bin2bcd(alrm->time.tm_hour) << 16; + rtc_reg_write(rtc, MA35_REG_RTC_TALM, val); + + ma35d1_alarm_irq_enable(dev, alrm->enabled); + + return 0; +} + +static const struct rtc_class_ops ma35d1_rtc_ops = { + .read_time = ma35d1_rtc_read_time, + .set_time = ma35d1_rtc_set_time, + .read_alarm = ma35d1_rtc_read_alarm, + .set_alarm = ma35d1_rtc_set_alarm, + .alarm_irq_enable = ma35d1_alarm_irq_enable, +}; + +static int ma35d1_rtc_probe(struct platform_device *pdev) +{ + struct ma35_rtc *rtc; + struct clk *clk; + u32 regval; + int ret; + + rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL); + if (!rtc) + return -ENOMEM; + + rtc->rtc_reg = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(rtc->rtc_reg)) + return PTR_ERR(rtc->rtc_reg); + + clk = of_clk_get(pdev->dev.of_node, 0); + if (IS_ERR(clk)) + return dev_err_probe(&pdev->dev, PTR_ERR(clk), "failed to find rtc clock\n"); + + ret = clk_prepare_enable(clk); + if (ret) + return ret; + + if (!(rtc_reg_read(rtc, MA35_REG_RTC_INIT) & RTC_INIT_ACTIVE)) { + ret = ma35d1_rtc_init(rtc, RTC_INIT_TIMEOUT); + if (ret) + return dev_err_probe(&pdev->dev, ret, "rtc init failed\n"); + } + + rtc->irq_num = platform_get_irq(pdev, 0); + + ret = devm_request_irq(&pdev->dev, rtc->irq_num, ma35d1_rtc_interrupt, + IRQF_NO_SUSPEND, "ma35d1rtc", rtc); + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to request rtc irq\n"); + + platform_set_drvdata(pdev, rtc); + + device_init_wakeup(&pdev->dev, true); + + rtc->rtcdev = devm_rtc_allocate_device(&pdev->dev); + if (IS_ERR(rtc->rtcdev)) + return PTR_ERR(rtc->rtcdev); + + rtc->rtcdev->ops = &ma35d1_rtc_ops; + rtc->rtcdev->range_min = RTC_TIMESTAMP_BEGIN_2000; + rtc->rtcdev->range_max = RTC_TIMESTAMP_END_2099; + + ret = devm_rtc_register_device(rtc->rtcdev); + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to register rtc device\n"); + + regval = rtc_reg_read(rtc, MA35_REG_RTC_INTEN); + regval |= RTC_INTEN_UIEN; + rtc_reg_write(rtc, MA35_REG_RTC_INTEN, regval); + + return 0; +} + +static int ma35d1_rtc_suspend(struct platform_device *pdev, pm_message_t state) +{ + struct ma35_rtc *rtc = platform_get_drvdata(pdev); + u32 regval; + + if (device_may_wakeup(&pdev->dev)) + enable_irq_wake(rtc->irq_num); + + regval = rtc_reg_read(rtc, MA35_REG_RTC_INTEN); + regval &= ~RTC_INTEN_UIEN; + rtc_reg_write(rtc, MA35_REG_RTC_INTEN, regval); + + return 0; +} + +static int ma35d1_rtc_resume(struct platform_device *pdev) +{ + struct ma35_rtc *rtc = platform_get_drvdata(pdev); + u32 regval; + + if (device_may_wakeup(&pdev->dev)) + disable_irq_wake(rtc->irq_num); + + regval = rtc_reg_read(rtc, MA35_REG_RTC_INTEN); + regval |= RTC_INTEN_UIEN; + rtc_reg_write(rtc, MA35_REG_RTC_INTEN, regval); + + return 0; +} + +static const struct of_device_id ma35d1_rtc_of_match[] = { + { .compatible = "nuvoton,ma35d1-rtc", }, + {}, +}; +MODULE_DEVICE_TABLE(of, ma35d1_rtc_of_match); + +static struct platform_driver ma35d1_rtc_driver = { + .suspend = ma35d1_rtc_suspend, + .resume = ma35d1_rtc_resume, + .probe = ma35d1_rtc_probe, + .driver = { + .name = "rtc-ma35d1", + .of_match_table = ma35d1_rtc_of_match, + }, +}; + +module_platform_driver(ma35d1_rtc_driver); + +MODULE_AUTHOR("Ming-Jen Chen "); +MODULE_DESCRIPTION("MA35D1 RTC driver"); +MODULE_LICENSE("GPL"); -- GitLab From becfce5233a78956654f36555f1b9187f8d11d56 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Wed, 29 Nov 2023 18:34:48 +0530 Subject: [PATCH 0366/1290] soundwire: amd: drop bus freq calculation and set 'max_clk_freq' max_dr_freq and curr_dr_freq is calculated and set in sdw_bus_master_add(). Setting in the driver is reduanant, so drop that. Set max_clk_freq instead. Signed-off-by: Vinod Koul Tested-by: Vijendar Mukunda Link: https://lore.kernel.org/r/20231129130449.9892-1-vkoul@kernel.org Signed-off-by: Vinod Koul --- drivers/soundwire/amd_manager.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c index a3b1f4e6f0f90..f54bb4dd2d101 100644 --- a/drivers/soundwire/amd_manager.c +++ b/drivers/soundwire/amd_manager.c @@ -950,13 +950,13 @@ static int amd_sdw_manager_probe(struct platform_device *pdev) amd_manager->reg_mask = &sdw_manager_reg_mask_array[amd_manager->instance]; params = &amd_manager->bus.params; - params->max_dr_freq = AMD_SDW_DEFAULT_CLK_FREQ * 2; - params->curr_dr_freq = AMD_SDW_DEFAULT_CLK_FREQ * 2; + params->col = AMD_SDW_DEFAULT_COLUMNS; params->row = AMD_SDW_DEFAULT_ROWS; prop = &amd_manager->bus.prop; prop->clk_freq = &amd_sdw_freq_tbl[0]; prop->mclk_freq = AMD_SDW_BUS_BASE_FREQ; + prop->max_clk_freq = AMD_SDW_DEFAULT_CLK_FREQ; ret = sdw_bus_master_add(&amd_manager->bus, dev, dev->fwnode); if (ret) { -- GitLab From ace196de694ebea5e2b3161e21ad169eb45accc6 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 14 Dec 2023 12:13:58 -0700 Subject: [PATCH 0367/1290] cxl: Fix unregister_region() callback parameter assignment In devm_cxl_add_region(), devm_add_action_or_reset() is called by passing in unregister_region() with data ptr of 'cxlr'. However, in unregister_region(), the passed in parameter is incorrectly assumed to be a 'struct device' rather than the 'cxlr' pointer. The code has been working because 'struct device' is the first member of 'struct cxl_region'. Issue found by inspection. Fix the assignment so that cxlr is pointing directly to the passed in parameter. Not flagged for -stable since there is no functional impact of this fix. Signed-off-by: Dave Jiang Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/170258123810.952211.3907381447996426480.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/region.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 56e575c79bb49..a61703c9d72ab 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2083,13 +2083,13 @@ static struct cxl_region *to_cxl_region(struct device *dev) return container_of(dev, struct cxl_region, dev); } -static void unregister_region(void *dev) +static void unregister_region(void *_cxlr) { - struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region *cxlr = _cxlr; struct cxl_region_params *p = &cxlr->params; int i; - device_del(dev); + device_del(&cxlr->dev); /* * Now that region sysfs is shutdown, the parameter block is now @@ -2100,7 +2100,7 @@ static void unregister_region(void *dev) detach_target(cxlr, i); cxl_region_iomem_release(cxlr); - put_device(dev); + put_device(&cxlr->dev); } static struct lock_class_key cxl_region_key; -- GitLab From 0b4b785d1f2557678c493dc1b431ca4ad16fde9b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Dec 2023 15:23:30 -0300 Subject: [PATCH 0368/1290] perf evlist: Move event attributes to after the / when uniquefying using the PMU name When turning an event with attributes to the format including the PMU we need to move the "event:attributes" format to "event/attributes/" so that we can copy the event displayed and use it in the command line, i.e. in 'perf top' we had: 1K cpu_atom/cycles:P/ 11K cpu_core/cycles:P/ If I try to use that on the command line: # perf top -e cpu_atom/cycles:P/ event syntax error: 'cpu_atom/cycles:P/' \___ Bad event or PMU Unable to find PMU or event on a PMU of 'cpu_atom' Initial error: event syntax error: 'cpu_atom/cycles:P/' \___ unknown term 'cycles:P' for pmu 'cpu_atom' valid terms: event,pc,edge,offcore_rsp,ldlat,inv,umask,cmask,config,config1,config2,config3,name,period,freq,branch_type,time,call-graph,stack-size,no-inherit,inherit,max-stack,nr,no-overwrite,overwrite ,driver-config,percore,aux-output,aux-sample-size,metric-id,raw,legacy-cache,hardware Run 'perf list' for a list of valid events Usage: perf top [] -e, --event event selector. use 'perf list' to list available events # Tested-by: Kan Liang Cc: Hector Martin Cc: Ian Rogers Cc: Marc Zyngier Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZXxyanyZgWBTOnoK@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 6f0892803c224..95f25e9fb994a 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -2521,9 +2521,8 @@ void evlist__warn_user_requested_cpus(struct evlist *evlist, const char *cpu_lis void evlist__uniquify_name(struct evlist *evlist) { + char *new_name, empty_attributes[2] = ":", *attributes; struct evsel *pos; - char *new_name; - int ret; if (perf_pmus__num_core_pmus() == 1) return; @@ -2535,11 +2534,17 @@ void evlist__uniquify_name(struct evlist *evlist) if (strchr(pos->name, '/')) continue; - ret = asprintf(&new_name, "%s/%s/", - pos->pmu_name, pos->name); - if (ret) { + attributes = strchr(pos->name, ':'); + if (attributes) + *attributes = '\0'; + else + attributes = empty_attributes; + + if (asprintf(&new_name, "%s/%s/%s", pos->pmu_name, pos->name, attributes + 1)) { free(pos->name); pos->name = new_name; + } else { + *attributes = ':'; } } } -- GitLab From 9a07a71ed3d23b56e1f05ea808ec5f59448fcc16 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 28 Nov 2023 11:46:24 -0800 Subject: [PATCH 0369/1290] perf tests: Make DSO tests a suite rather than individual Make the DSO data tests a suite rather than individual so their output is grouped. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Yang Jihong Link: https://lore.kernel.org/r/20231128194624.1419260-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 2 -- tools/perf/tests/dso-data.c | 15 ++++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index b8c21e81a021b..4a5973f9bb9b3 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -62,8 +62,6 @@ static struct test_suite *generic_tests[] = { &suite__pmu, &suite__pmu_events, &suite__dso_data, - &suite__dso_data_cache, - &suite__dso_data_reopen, &suite__perf_evsel__roundtrip_name_test, #ifdef HAVE_LIBTRACEEVENT &suite__perf_evsel__tp_sched_test, diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c index 3419a4ab5590f..2d67422c12229 100644 --- a/tools/perf/tests/dso-data.c +++ b/tools/perf/tests/dso-data.c @@ -394,6 +394,15 @@ static int test__dso_data_reopen(struct test_suite *test __maybe_unused, int sub return 0; } -DEFINE_SUITE("DSO data read", dso_data); -DEFINE_SUITE("DSO data cache", dso_data_cache); -DEFINE_SUITE("DSO data reopen", dso_data_reopen); + +static struct test_case tests__dso_data[] = { + TEST_CASE("read", dso_data), + TEST_CASE("cache", dso_data_cache), + TEST_CASE("reopen", dso_data_reopen), + { .name = NULL, } +}; + +struct test_suite suite__dso_data = { + .desc = "DSO data tests", + .test_cases = tests__dso_data, +}; -- GitLab From 3e0594f9f0f774918d63701dc7de634bb1bc7b1e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 28 Nov 2023 22:02:07 -0800 Subject: [PATCH 0370/1290] perf top: Avoid repeated function calls to perf_cpu_map__nr(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a local variable to avoid repeated calls to perf_cpu_map__nr(). Reviewed-by: James Clark Signed-off-by: Adrian Hunter Signed-off-by: Ian Rogers Acked-by: Adrian Hunter Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Rajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Link: https://lore.kernel.org/r/20231129060211.1890454-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/top.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c index be7157de04518..4db3d1bd686cf 100644 --- a/tools/perf/util/top.c +++ b/tools/perf/util/top.c @@ -28,6 +28,7 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) struct record_opts *opts = &top->record_opts; struct target *target = &opts->target; size_t ret = 0; + int nr_cpus; if (top->samples) { samples_per_sec = top->samples / top->delay_secs; @@ -93,19 +94,17 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) else ret += SNPRINTF(bf + ret, size - ret, " (all"); + nr_cpus = perf_cpu_map__nr(top->evlist->core.user_requested_cpus); if (target->cpu_list) ret += SNPRINTF(bf + ret, size - ret, ", CPU%s: %s)", - perf_cpu_map__nr(top->evlist->core.user_requested_cpus) > 1 - ? "s" : "", + nr_cpus > 1 ? "s" : "", target->cpu_list); else { if (target->tid) ret += SNPRINTF(bf + ret, size - ret, ")"); else ret += SNPRINTF(bf + ret, size - ret, ", %d CPU%s)", - perf_cpu_map__nr(top->evlist->core.user_requested_cpus), - perf_cpu_map__nr(top->evlist->core.user_requested_cpus) > 1 - ? "s" : ""); + nr_cpus, nr_cpus > 1 ? "s" : ""); } perf_top__reset_sample_counters(top); -- GitLab From 67bc993446d340d4f059014f6be6ead35ec5f88b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 28 Nov 2023 22:02:11 -0800 Subject: [PATCH 0371/1290] libperf cpumap: Document perf_cpu_map__nr()'s behavior MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit perf_cpu_map__nr()'s behavior around an empty CPU map is strange as it returns that there is 1 CPU. Changing code that may rely on this behavior is hard, we can at least document the behavior. Reviewed-by: James Clark Signed-off-by: Ian Rogers Acked-by: Adrian Hunter Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Alexandre Ghiti Cc: Andrew Jones Cc: André Almeida Cc: Athira Rajeev Cc: Atish Patra Cc: Changbin Du Cc: Darren Hart Cc: Davidlohr Bueso Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mike Leach Cc: Nick Desaulniers Cc: Paolo Bonzini Cc: Paran Lee Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Jihong Cc: Yang Li Cc: Yanteng Si Link: https://lore.kernel.org/r/20231129060211.1890454-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/perf/cpumap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h index dbe0a7352b648..228c6c629b0ce 100644 --- a/tools/lib/perf/include/perf/cpumap.h +++ b/tools/lib/perf/include/perf/cpumap.h @@ -44,7 +44,18 @@ LIBPERF_API struct perf_cpu_map *perf_cpu_map__merge(struct perf_cpu_map *orig, LIBPERF_API struct perf_cpu_map *perf_cpu_map__intersect(struct perf_cpu_map *orig, struct perf_cpu_map *other); LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map); +/** + * perf_cpu_map__cpu - get the CPU value at the given index. Returns -1 if index + * is invalid. + */ LIBPERF_API struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx); +/** + * perf_cpu_map__nr - for an empty map returns 1, as perf_cpu_map__cpu returns a + * cpu of -1 for an invalid index, this makes an empty map + * look like it contains the "any CPU"/dummy value. Otherwise + * the result is the number CPUs in the map plus one if the + * "any CPU"/dummy value is present. + */ LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus); /** * perf_cpu_map__has_any_cpu_or_is_empty - is map either empty or has the "any CPU"/dummy value. -- GitLab From 5cc47ffba7b71e37d65c259f7f5778b3622f1e8f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:35 -0800 Subject: [PATCH 0372/1290] perf map: Improve map/unmap parameter names The u64 values are either absolute or relative, try to hint better in the parameter names. Suggested-by: Namhyung Kim Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 3a3b7757da5fd..49756716cb132 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -105,25 +105,25 @@ static inline u64 map__dso_map_ip(const struct map *map, u64 ip) } /* dso rip -> ip */ -static inline u64 map__dso_unmap_ip(const struct map *map, u64 ip) +static inline u64 map__dso_unmap_ip(const struct map *map, u64 rip) { - return ip + map__start(map) - map__pgoff(map); + return rip + map__start(map) - map__pgoff(map); } -static inline u64 map__map_ip(const struct map *map, u64 ip) +static inline u64 map__map_ip(const struct map *map, u64 ip_or_rip) { if ((RC_CHK_ACCESS(map)->mapping_type) == MAPPING_TYPE__DSO) - return map__dso_map_ip(map, ip); + return map__dso_map_ip(map, ip_or_rip); else - return ip; + return ip_or_rip; } -static inline u64 map__unmap_ip(const struct map *map, u64 ip) +static inline u64 map__unmap_ip(const struct map *map, u64 ip_or_rip) { if ((RC_CHK_ACCESS(map)->mapping_type) == MAPPING_TYPE__DSO) - return map__dso_unmap_ip(map, ip); + return map__dso_unmap_ip(map, ip_or_rip); else - return ip; + return ip_or_rip; } /* rip/ip <-> addr suitable for passing to `objdump --start-address=` */ -- GitLab From 19b5bd9a59bed4e9937092e2b685d69656bfc606 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:36 -0800 Subject: [PATCH 0373/1290] perf maps: Add maps__for_each_map to iterate maps holding the lock The macro maps__for_each_entry is error prone as it doesn't require holding the maps lock. Add a new function that iterates the maps holding the read lock. Convert maps__find_symbol_by_name() and maps__fprintf() to use callbacks, the latter being an example of where the read lock wasn't being held. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 99 ++++++++++++++++++++++++++++-------------- tools/perf/util/maps.h | 3 ++ 2 files changed, 69 insertions(+), 33 deletions(-) diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 9a011aed4b754..160a6dce54bb6 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -196,6 +196,21 @@ void maps__put(struct maps *maps) RC_CHK_PUT(maps); } +int maps__for_each_map(struct maps *maps, int (*cb)(struct map *map, void *data), void *data) +{ + struct map_rb_node *pos; + int ret = 0; + + down_read(maps__lock(maps)); + maps__for_each_entry(maps, pos) { + ret = cb(pos->map, data); + if (ret) + break; + } + up_read(maps__lock(maps)); + return ret; +} + struct symbol *maps__find_symbol(struct maps *maps, u64 addr, struct map **mapp) { struct map *map = maps__find(maps, addr); @@ -210,31 +225,40 @@ struct symbol *maps__find_symbol(struct maps *maps, u64 addr, struct map **mapp) return NULL; } -struct symbol *maps__find_symbol_by_name(struct maps *maps, const char *name, struct map **mapp) -{ +struct maps__find_symbol_by_name_args { + struct map **mapp; + const char *name; struct symbol *sym; - struct map_rb_node *pos; +}; - down_read(maps__lock(maps)); +static int maps__find_symbol_by_name_cb(struct map *map, void *data) +{ + struct maps__find_symbol_by_name_args *args = data; - maps__for_each_entry(maps, pos) { - sym = map__find_symbol_by_name(pos->map, name); + args->sym = map__find_symbol_by_name(map, args->name); + if (!args->sym) + return 0; - if (sym == NULL) - continue; - if (!map__contains_symbol(pos->map, sym)) { - sym = NULL; - continue; - } - if (mapp != NULL) - *mapp = pos->map; - goto out; + if (!map__contains_symbol(map, args->sym)) { + args->sym = NULL; + return 0; } - sym = NULL; -out: - up_read(maps__lock(maps)); - return sym; + if (args->mapp != NULL) + *args->mapp = map__get(map); + return 1; +} + +struct symbol *maps__find_symbol_by_name(struct maps *maps, const char *name, struct map **mapp) +{ + struct maps__find_symbol_by_name_args args = { + .mapp = mapp, + .name = name, + .sym = NULL, + }; + + maps__for_each_map(maps, maps__find_symbol_by_name_cb, &args); + return args.sym; } int maps__find_ams(struct maps *maps, struct addr_map_symbol *ams) @@ -253,25 +277,34 @@ int maps__find_ams(struct maps *maps, struct addr_map_symbol *ams) return ams->ms.sym ? 0 : -1; } -size_t maps__fprintf(struct maps *maps, FILE *fp) -{ - size_t printed = 0; - struct map_rb_node *pos; +struct maps__fprintf_args { + FILE *fp; + size_t printed; +}; - down_read(maps__lock(maps)); +static int maps__fprintf_cb(struct map *map, void *data) +{ + struct maps__fprintf_args *args = data; - maps__for_each_entry(maps, pos) { - printed += fprintf(fp, "Map:"); - printed += map__fprintf(pos->map, fp); - if (verbose > 2) { - printed += dso__fprintf(map__dso(pos->map), fp); - printed += fprintf(fp, "--\n"); - } + args->printed += fprintf(args->fp, "Map:"); + args->printed += map__fprintf(map, args->fp); + if (verbose > 2) { + args->printed += dso__fprintf(map__dso(map), args->fp); + args->printed += fprintf(args->fp, "--\n"); } + return 0; +} - up_read(maps__lock(maps)); +size_t maps__fprintf(struct maps *maps, FILE *fp) +{ + struct maps__fprintf_args args = { + .fp = fp, + .printed = 0, + }; + + maps__for_each_map(maps, maps__fprintf_cb, &args); - return printed; + return args.printed; } int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp) diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h index a689149be8c43..14ad959792577 100644 --- a/tools/perf/util/maps.h +++ b/tools/perf/util/maps.h @@ -81,6 +81,9 @@ static inline void __maps__zput(struct maps **map) #define maps__zput(map) __maps__zput(&map) +/* Iterate over map calling cb for each entry. */ +int maps__for_each_map(struct maps *maps, int (*cb)(struct map *map, void *data), void *data); + static inline struct rb_root *maps__entries(struct maps *maps) { return &RC_CHK_ACCESS(maps)->entries; -- GitLab From bc4bc56d9d74f4593f253531edcea9ea8e30e7f1 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:37 -0800 Subject: [PATCH 0374/1290] perf events x86: Use function to add missing lock Switch from loop macro maps__for_each_entry to maps__for_each_map function that takes a callback. The function holds the maps lock, which should be held during iteration. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/event.c | 103 +++++++++++++++++-------------- 1 file changed, 58 insertions(+), 45 deletions(-) diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c index 5741ffe473120..e65b7dbe27fbc 100644 --- a/tools/perf/arch/x86/util/event.c +++ b/tools/perf/arch/x86/util/event.c @@ -14,66 +14,79 @@ #if defined(__x86_64__) -int perf_event__synthesize_extra_kmaps(struct perf_tool *tool, - perf_event__handler_t process, - struct machine *machine) +struct perf_event__synthesize_extra_kmaps_cb_args { + struct perf_tool *tool; + perf_event__handler_t process; + struct machine *machine; + union perf_event *event; +}; + +static int perf_event__synthesize_extra_kmaps_cb(struct map *map, void *data) { - int rc = 0; - struct map_rb_node *pos; - struct maps *kmaps = machine__kernel_maps(machine); - union perf_event *event = zalloc(sizeof(event->mmap) + - machine->id_hdr_size); + struct perf_event__synthesize_extra_kmaps_cb_args *args = data; + union perf_event *event = args->event; + struct kmap *kmap; + size_t size; - if (!event) { - pr_debug("Not enough memory synthesizing mmap event " - "for extra kernel maps\n"); - return -1; - } + if (!__map__is_extra_kernel_map(map)) + return 0; - maps__for_each_entry(kmaps, pos) { - struct kmap *kmap; - size_t size; - struct map *map = pos->map; + kmap = map__kmap(map); - if (!__map__is_extra_kernel_map(map)) - continue; + size = sizeof(event->mmap) - sizeof(event->mmap.filename) + + PERF_ALIGN(strlen(kmap->name) + 1, sizeof(u64)) + + args->machine->id_hdr_size; - kmap = map__kmap(map); + memset(event, 0, size); - size = sizeof(event->mmap) - sizeof(event->mmap.filename) + - PERF_ALIGN(strlen(kmap->name) + 1, sizeof(u64)) + - machine->id_hdr_size; + event->mmap.header.type = PERF_RECORD_MMAP; - memset(event, 0, size); + /* + * kernel uses 0 for user space maps, see kernel/perf_event.c + * __perf_event_mmap + */ + if (machine__is_host(args->machine)) + event->header.misc = PERF_RECORD_MISC_KERNEL; + else + event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL; - event->mmap.header.type = PERF_RECORD_MMAP; + event->mmap.header.size = size; - /* - * kernel uses 0 for user space maps, see kernel/perf_event.c - * __perf_event_mmap - */ - if (machine__is_host(machine)) - event->header.misc = PERF_RECORD_MISC_KERNEL; - else - event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL; + event->mmap.start = map__start(map); + event->mmap.len = map__size(map); + event->mmap.pgoff = map__pgoff(map); + event->mmap.pid = args->machine->pid; - event->mmap.header.size = size; + strlcpy(event->mmap.filename, kmap->name, PATH_MAX); - event->mmap.start = map__start(map); - event->mmap.len = map__size(map); - event->mmap.pgoff = map__pgoff(map); - event->mmap.pid = machine->pid; + if (perf_tool__process_synth_event(args->tool, event, args->machine, args->process) != 0) + return -1; - strlcpy(event->mmap.filename, kmap->name, PATH_MAX); + return 0; +} - if (perf_tool__process_synth_event(tool, event, machine, - process) != 0) { - rc = -1; - break; - } +int perf_event__synthesize_extra_kmaps(struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine) +{ + int rc; + struct maps *kmaps = machine__kernel_maps(machine); + struct perf_event__synthesize_extra_kmaps_cb_args args = { + .tool = tool, + .process = process, + .machine = machine, + .event = zalloc(sizeof(args.event->mmap) + machine->id_hdr_size), + }; + + if (!args.event) { + pr_debug("Not enough memory synthesizing mmap event " + "for extra kernel maps\n"); + return -1; } - free(event); + rc = maps__for_each_map(kmaps, perf_event__synthesize_extra_kmaps_cb, &args); + + free(args.event); return rc; } -- GitLab From 431be14b193ad19946aeb26a6016dc5b8eb6a248 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:38 -0800 Subject: [PATCH 0375/1290] perf report: Use function to add missing maps lock Switch maps__fprintf_task from loop macro maps__for_each_entry to maps__for_each_map function that takes a callback. The function holds the maps lock, which should be held during iteration. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 54 +++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 17fb171e898b9..178fb602bc982 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -856,27 +856,47 @@ static struct task *tasks_list(struct task *task, struct machine *machine) return tasks_list(parent_task, machine); } -static size_t maps__fprintf_task(struct maps *maps, int indent, FILE *fp) +struct maps__fprintf_task_args { + int indent; + FILE *fp; + size_t printed; +}; + +static int maps__fprintf_task_cb(struct map *map, void *data) { - size_t printed = 0; - struct map_rb_node *rb_node; + struct maps__fprintf_task_args *args = data; + const struct dso *dso = map__dso(map); + u32 prot = map__prot(map); + int ret; - maps__for_each_entry(maps, rb_node) { - struct map *map = rb_node->map; - const struct dso *dso = map__dso(map); - u32 prot = map__prot(map); + ret = fprintf(args->fp, + "%*s %" PRIx64 "-%" PRIx64 " %c%c%c%c %08" PRIx64 " %" PRIu64 " %s\n", + args->indent, "", map__start(map), map__end(map), + prot & PROT_READ ? 'r' : '-', + prot & PROT_WRITE ? 'w' : '-', + prot & PROT_EXEC ? 'x' : '-', + map__flags(map) ? 's' : 'p', + map__pgoff(map), + dso->id.ino, dso->name); - printed += fprintf(fp, "%*s %" PRIx64 "-%" PRIx64 " %c%c%c%c %08" PRIx64 " %" PRIu64 " %s\n", - indent, "", map__start(map), map__end(map), - prot & PROT_READ ? 'r' : '-', - prot & PROT_WRITE ? 'w' : '-', - prot & PROT_EXEC ? 'x' : '-', - map__flags(map) ? 's' : 'p', - map__pgoff(map), - dso->id.ino, dso->name); - } + if (ret < 0) + return ret; + + args->printed += ret; + return 0; +} + +static size_t maps__fprintf_task(struct maps *maps, int indent, FILE *fp) +{ + struct maps__fprintf_task_args args = { + .indent = indent, + .fp = fp, + .printed = 0, + }; + + maps__for_each_map(maps, maps__fprintf_task_cb, &args); - return printed; + return args.printed; } static void task__print_level(struct task *task, FILE *fp, int level) -- GitLab From b1928ca950386729b3bcd555efe559eaa1e2c8cc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:39 -0800 Subject: [PATCH 0376/1290] perf tests: Use function to add missing maps lock Switch loop macro maps__for_each_entry to maps__for_each_map function that takes a callback. The function holds the maps lock, which should be held during iteration. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/maps.c | 61 ++++++---- tools/perf/tests/vmlinux-kallsyms.c | 181 +++++++++++++++------------- 2 files changed, 136 insertions(+), 106 deletions(-) diff --git a/tools/perf/tests/maps.c b/tools/perf/tests/maps.c index 5bb1123a91a7c..bb3fbfe5a73e2 100644 --- a/tools/perf/tests/maps.c +++ b/tools/perf/tests/maps.c @@ -14,44 +14,59 @@ struct map_def { u64 end; }; +struct check_maps_cb_args { + struct map_def *merged; + unsigned int i; +}; + +static int check_maps_cb(struct map *map, void *data) +{ + struct check_maps_cb_args *args = data; + struct map_def *merged = &args->merged[args->i]; + + if (map__start(map) != merged->start || + map__end(map) != merged->end || + strcmp(map__dso(map)->name, merged->name) || + refcount_read(map__refcnt(map)) != 1) { + return 1; + } + args->i++; + return 0; +} + +static int failed_cb(struct map *map, void *data __maybe_unused) +{ + pr_debug("\tstart: %" PRIu64 " end: %" PRIu64 " name: '%s' refcnt: %d\n", + map__start(map), + map__end(map), + map__dso(map)->name, + refcount_read(map__refcnt(map))); + + return 0; +} + static int check_maps(struct map_def *merged, unsigned int size, struct maps *maps) { - struct map_rb_node *rb_node; - unsigned int i = 0; bool failed = false; if (maps__nr_maps(maps) != size) { pr_debug("Expected %d maps, got %d", size, maps__nr_maps(maps)); failed = true; } else { - maps__for_each_entry(maps, rb_node) { - struct map *map = rb_node->map; - - if (map__start(map) != merged[i].start || - map__end(map) != merged[i].end || - strcmp(map__dso(map)->name, merged[i].name) || - refcount_read(map__refcnt(map)) != 1) { - failed = true; - } - i++; - } + struct check_maps_cb_args args = { + .merged = merged, + .i = 0, + }; + failed = maps__for_each_map(maps, check_maps_cb, &args); } if (failed) { pr_debug("Expected:\n"); - for (i = 0; i < size; i++) { + for (unsigned int i = 0; i < size; i++) { pr_debug("\tstart: %" PRIu64 " end: %" PRIu64 " name: '%s' refcnt: 1\n", merged[i].start, merged[i].end, merged[i].name); } pr_debug("Got:\n"); - maps__for_each_entry(maps, rb_node) { - struct map *map = rb_node->map; - - pr_debug("\tstart: %" PRIu64 " end: %" PRIu64 " name: '%s' refcnt: %d\n", - map__start(map), - map__end(map), - map__dso(map)->name, - refcount_read(map__refcnt(map))); - } + maps__for_each_map(maps, failed_cb, NULL); } return failed ? TEST_FAIL : TEST_OK; } diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index 1078a93b01aa0..822f893e67d5f 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -112,18 +112,92 @@ static bool is_ignored_symbol(const char *name, char type) return false; } +struct test__vmlinux_matches_kallsyms_cb_args { + struct machine kallsyms; + struct map *vmlinux_map; + bool header_printed; +}; + +static int test__vmlinux_matches_kallsyms_cb1(struct map *map, void *data) +{ + struct test__vmlinux_matches_kallsyms_cb_args *args = data; + struct dso *dso = map__dso(map); + /* + * If it is the kernel, kallsyms is always "[kernel.kallsyms]", while + * the kernel will have the path for the vmlinux file being used, so use + * the short name, less descriptive but the same ("[kernel]" in both + * cases. + */ + struct map *pair = maps__find_by_name(args->kallsyms.kmaps, + (dso->kernel ? dso->short_name : dso->name)); + + if (pair) + map__set_priv(pair, 1); + else { + if (!args->header_printed) { + pr_info("WARN: Maps only in vmlinux:\n"); + args->header_printed = true; + } + map__fprintf(map, stderr); + } + return 0; +} + +static int test__vmlinux_matches_kallsyms_cb2(struct map *map, void *data) +{ + struct test__vmlinux_matches_kallsyms_cb_args *args = data; + struct map *pair; + u64 mem_start = map__unmap_ip(args->vmlinux_map, map__start(map)); + u64 mem_end = map__unmap_ip(args->vmlinux_map, map__end(map)); + + pair = maps__find(args->kallsyms.kmaps, mem_start); + if (pair == NULL || map__priv(pair)) + return 0; + + if (map__start(pair) == mem_start) { + struct dso *dso = map__dso(map); + + if (!args->header_printed) { + pr_info("WARN: Maps in vmlinux with a different name in kallsyms:\n"); + args->header_printed = true; + } + + pr_info("WARN: %" PRIx64 "-%" PRIx64 " %" PRIx64 " %s in kallsyms as", + map__start(map), map__end(map), map__pgoff(map), dso->name); + if (mem_end != map__end(pair)) + pr_info(":\nWARN: *%" PRIx64 "-%" PRIx64 " %" PRIx64, + map__start(pair), map__end(pair), map__pgoff(pair)); + pr_info(" %s\n", dso->name); + map__set_priv(pair, 1); + } + return 0; +} + +static int test__vmlinux_matches_kallsyms_cb3(struct map *map, void *data) +{ + struct test__vmlinux_matches_kallsyms_cb_args *args = data; + + if (!map__priv(map)) { + if (!args->header_printed) { + pr_info("WARN: Maps only in kallsyms:\n"); + args->header_printed = true; + } + map__fprintf(map, stderr); + } + return 0; +} + static int test__vmlinux_matches_kallsyms(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { int err = TEST_FAIL; struct rb_node *nd; struct symbol *sym; - struct map *kallsyms_map, *vmlinux_map; - struct map_rb_node *rb_node; - struct machine kallsyms, vmlinux; + struct map *kallsyms_map; + struct machine vmlinux; struct maps *maps; u64 mem_start, mem_end; - bool header_printed; + struct test__vmlinux_matches_kallsyms_cb_args args; /* * Step 1: @@ -131,7 +205,7 @@ static int test__vmlinux_matches_kallsyms(struct test_suite *test __maybe_unused * Init the machines that will hold kernel, modules obtained from * both vmlinux + .ko files and from /proc/kallsyms split by modules. */ - machine__init(&kallsyms, "", HOST_KERNEL_ID); + machine__init(&args.kallsyms, "", HOST_KERNEL_ID); machine__init(&vmlinux, "", HOST_KERNEL_ID); maps = machine__kernel_maps(&vmlinux); @@ -143,7 +217,7 @@ static int test__vmlinux_matches_kallsyms(struct test_suite *test __maybe_unused * load /proc/kallsyms. Also create the modules maps from /proc/modules * and find the .ko files that match them in /lib/modules/`uname -r`/. */ - if (machine__create_kernel_maps(&kallsyms) < 0) { + if (machine__create_kernel_maps(&args.kallsyms) < 0) { pr_debug("machine__create_kernel_maps failed"); err = TEST_SKIP; goto out; @@ -160,7 +234,7 @@ static int test__vmlinux_matches_kallsyms(struct test_suite *test __maybe_unused * be compacted against the list of modules found in the "vmlinux" * code and with the one got from /proc/modules from the "kallsyms" code. */ - if (machine__load_kallsyms(&kallsyms, "/proc/kallsyms") <= 0) { + if (machine__load_kallsyms(&args.kallsyms, "/proc/kallsyms") <= 0) { pr_debug("machine__load_kallsyms failed"); err = TEST_SKIP; goto out; @@ -174,7 +248,7 @@ static int test__vmlinux_matches_kallsyms(struct test_suite *test __maybe_unused * to see if the running kernel was relocated by checking if it has the * same value in the vmlinux file we load. */ - kallsyms_map = machine__kernel_map(&kallsyms); + kallsyms_map = machine__kernel_map(&args.kallsyms); /* * Step 5: @@ -186,7 +260,7 @@ static int test__vmlinux_matches_kallsyms(struct test_suite *test __maybe_unused goto out; } - vmlinux_map = machine__kernel_map(&vmlinux); + args.vmlinux_map = machine__kernel_map(&vmlinux); /* * Step 6: @@ -213,7 +287,7 @@ static int test__vmlinux_matches_kallsyms(struct test_suite *test __maybe_unused * in the kallsyms dso. For the ones that are in both, check its names and * end addresses too. */ - map__for_each_symbol(vmlinux_map, sym, nd) { + map__for_each_symbol(args.vmlinux_map, sym, nd) { struct symbol *pair, *first_pair; sym = rb_entry(nd, struct symbol, rb_node); @@ -221,10 +295,10 @@ static int test__vmlinux_matches_kallsyms(struct test_suite *test __maybe_unused if (sym->start == sym->end) continue; - mem_start = map__unmap_ip(vmlinux_map, sym->start); - mem_end = map__unmap_ip(vmlinux_map, sym->end); + mem_start = map__unmap_ip(args.vmlinux_map, sym->start); + mem_end = map__unmap_ip(args.vmlinux_map, sym->end); - first_pair = machine__find_kernel_symbol(&kallsyms, mem_start, NULL); + first_pair = machine__find_kernel_symbol(&args.kallsyms, mem_start, NULL); pair = first_pair; if (pair && UM(pair->start) == mem_start) { @@ -253,7 +327,8 @@ next_pair: */ continue; } else { - pair = machine__find_kernel_symbol_by_name(&kallsyms, sym->name, NULL); + pair = machine__find_kernel_symbol_by_name(&args.kallsyms, + sym->name, NULL); if (pair) { if (UM(pair->start) == mem_start) goto next_pair; @@ -267,7 +342,7 @@ next_pair: continue; } - } else if (mem_start == map__end(kallsyms.vmlinux_map)) { + } else if (mem_start == map__end(args.kallsyms.vmlinux_map)) { /* * Ignore aliases to _etext, i.e. to the end of the kernel text area, * such as __indirect_thunk_end. @@ -289,78 +364,18 @@ next_pair: if (verbose <= 0) goto out; - header_printed = false; - - maps__for_each_entry(maps, rb_node) { - struct map *map = rb_node->map; - struct dso *dso = map__dso(map); - /* - * If it is the kernel, kallsyms is always "[kernel.kallsyms]", while - * the kernel will have the path for the vmlinux file being used, - * so use the short name, less descriptive but the same ("[kernel]" in - * both cases. - */ - struct map *pair = maps__find_by_name(kallsyms.kmaps, (dso->kernel ? - dso->short_name : - dso->name)); - if (pair) { - map__set_priv(pair, 1); - } else { - if (!header_printed) { - pr_info("WARN: Maps only in vmlinux:\n"); - header_printed = true; - } - map__fprintf(map, stderr); - } - } - - header_printed = false; - - maps__for_each_entry(maps, rb_node) { - struct map *pair, *map = rb_node->map; - - mem_start = map__unmap_ip(vmlinux_map, map__start(map)); - mem_end = map__unmap_ip(vmlinux_map, map__end(map)); + args.header_printed = false; + maps__for_each_map(maps, test__vmlinux_matches_kallsyms_cb1, &args); - pair = maps__find(kallsyms.kmaps, mem_start); - if (pair == NULL || map__priv(pair)) - continue; - - if (map__start(pair) == mem_start) { - struct dso *dso = map__dso(map); - - if (!header_printed) { - pr_info("WARN: Maps in vmlinux with a different name in kallsyms:\n"); - header_printed = true; - } - - pr_info("WARN: %" PRIx64 "-%" PRIx64 " %" PRIx64 " %s in kallsyms as", - map__start(map), map__end(map), map__pgoff(map), dso->name); - if (mem_end != map__end(pair)) - pr_info(":\nWARN: *%" PRIx64 "-%" PRIx64 " %" PRIx64, - map__start(pair), map__end(pair), map__pgoff(pair)); - pr_info(" %s\n", dso->name); - map__set_priv(pair, 1); - } - } - - header_printed = false; - - maps = machine__kernel_maps(&kallsyms); + args.header_printed = false; + maps__for_each_map(maps, test__vmlinux_matches_kallsyms_cb2, &args); - maps__for_each_entry(maps, rb_node) { - struct map *map = rb_node->map; + args.header_printed = false; + maps = machine__kernel_maps(&args.kallsyms); + maps__for_each_map(maps, test__vmlinux_matches_kallsyms_cb3, &args); - if (!map__priv(map)) { - if (!header_printed) { - pr_info("WARN: Maps only in kallsyms:\n"); - header_printed = true; - } - map__fprintf(map, stderr); - } - } out: - machine__exit(&kallsyms); + machine__exit(&args.kallsyms); machine__exit(&vmlinux); return err; } -- GitLab From 2dc549b1dd495e7cdaa822a0af97365a8a1de840 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:40 -0800 Subject: [PATCH 0377/1290] perf machine: Use function to add missing maps lock Switch machine__map_x86_64_entry_trampolines and machine__for_each_kernel_map from loop macro maps__for_each_entry to maps__for_each_map function that takes a callback. The function holds the maps lock, which should be held during iteration. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-7-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 53 +++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index c5de5363b5e75..ca855fc435ac5 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1285,33 +1285,46 @@ static u64 find_entry_trampoline(struct dso *dso) #define X86_64_CPU_ENTRY_AREA_SIZE 0x2c000 #define X86_64_ENTRY_TRAMPOLINE 0x6000 +struct machine__map_x86_64_entry_trampolines_args { + struct maps *kmaps; + bool found; +}; + +static int machine__map_x86_64_entry_trampolines_cb(struct map *map, void *data) +{ + struct machine__map_x86_64_entry_trampolines_args *args = data; + struct map *dest_map; + struct kmap *kmap = __map__kmap(map); + + if (!kmap || !is_entry_trampoline(kmap->name)) + return 0; + + dest_map = maps__find(args->kmaps, map__pgoff(map)); + if (dest_map != map) + map__set_pgoff(map, map__map_ip(dest_map, map__pgoff(map))); + + args->found = true; + return 0; +} + /* Map x86_64 PTI entry trampolines */ int machine__map_x86_64_entry_trampolines(struct machine *machine, struct dso *kernel) { - struct maps *kmaps = machine__kernel_maps(machine); + struct machine__map_x86_64_entry_trampolines_args args = { + .kmaps = machine__kernel_maps(machine), + .found = false, + }; int nr_cpus_avail, cpu; - bool found = false; - struct map_rb_node *rb_node; u64 pgoff; /* * In the vmlinux case, pgoff is a virtual address which must now be * mapped to a vmlinux offset. */ - maps__for_each_entry(kmaps, rb_node) { - struct map *dest_map, *map = rb_node->map; - struct kmap *kmap = __map__kmap(map); - - if (!kmap || !is_entry_trampoline(kmap->name)) - continue; + maps__for_each_map(args.kmaps, machine__map_x86_64_entry_trampolines_cb, &args); - dest_map = maps__find(kmaps, map__pgoff(map)); - if (dest_map != map) - map__set_pgoff(map, map__map_ip(dest_map, map__pgoff(map))); - found = true; - } - if (found || machine->trampolines_mapped) + if (args.found || machine->trampolines_mapped) return 0; pgoff = find_entry_trampoline(kernel); @@ -3398,16 +3411,8 @@ int machine__for_each_dso(struct machine *machine, machine__dso_t fn, void *priv int machine__for_each_kernel_map(struct machine *machine, machine__map_t fn, void *priv) { struct maps *maps = machine__kernel_maps(machine); - struct map_rb_node *pos; - int err = 0; - maps__for_each_entry(maps, pos) { - err = fn(pos->map, priv); - if (err != 0) { - break; - } - } - return err; + return maps__for_each_map(maps, fn, priv); } bool machine__is_lock_function(struct machine *machine, u64 addr) -- GitLab From 300b53d5b819a33e2d4567cc35e1d92b4b49cd9c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:41 -0800 Subject: [PATCH 0378/1290] perf probe-event: Use function to add missing maps lock Switch kernel_get_module_map from loop macro maps__for_each_entry to maps__for_each_map function that takes a callback. The function holds the maps lock, which should be held during iteration. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-8-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 40 +++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 1a5b7fa459b23..a1a796043691f 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -149,10 +149,32 @@ static int kernel_get_symbol_address_by_name(const char *name, u64 *addr, return 0; } +struct kernel_get_module_map_cb_args { + const char *module; + struct map *result; +}; + +static int kernel_get_module_map_cb(struct map *map, void *data) +{ + struct kernel_get_module_map_cb_args *args = data; + struct dso *dso = map__dso(map); + const char *short_name = dso->short_name; /* short_name is "[module]" */ + u16 short_name_len = dso->short_name_len; + + if (strncmp(short_name + 1, args->module, short_name_len - 2) == 0 && + args->module[short_name_len - 2] == '\0') { + args->result = map__get(map); + return 1; + } + return 0; +} + static struct map *kernel_get_module_map(const char *module) { - struct maps *maps = machine__kernel_maps(host_machine); - struct map_rb_node *pos; + struct kernel_get_module_map_cb_args args = { + .module = module, + .result = NULL, + }; /* A file path -- this is an offline module */ if (module && strchr(module, '/')) @@ -164,19 +186,9 @@ static struct map *kernel_get_module_map(const char *module) return map__get(map); } - maps__for_each_entry(maps, pos) { - /* short_name is "[module]" */ - struct dso *dso = map__dso(pos->map); - const char *short_name = dso->short_name; - u16 short_name_len = dso->short_name_len; + maps__for_each_map(machine__kernel_maps(host_machine), kernel_get_module_map_cb, &args); - if (strncmp(short_name + 1, module, - short_name_len - 2) == 0 && - module[short_name_len - 2] == '\0') { - return map__get(pos->map); - } - } - return NULL; + return args.result; } struct map *get_target_map(const char *target, struct nsinfo *nsi, bool user) -- GitLab From 111350c67d15ffc284801509acdcf256d77876ca Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:42 -0800 Subject: [PATCH 0379/1290] perf symbol: Use function to add missing maps lock Switch do_validate_kcore_modules from loop macro maps__for_each_entry to maps__for_each_map function that takes a callback. The function holds the maps lock, which should be held during iteration. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-9-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 1cc42b8d8afb7..72f03b875478a 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1114,33 +1114,35 @@ out_delete_from: return ret; } +static int do_validate_kcore_modules_cb(struct map *old_map, void *data) +{ + struct rb_root *modules = data; + struct module_info *mi; + struct dso *dso; + + if (!__map__is_kmodule(old_map)) + return 0; + + dso = map__dso(old_map); + /* Module must be in memory at the same address */ + mi = find_module(dso->short_name, modules); + if (!mi || mi->start != map__start(old_map)) + return -EINVAL; + + return 0; +} + static int do_validate_kcore_modules(const char *filename, struct maps *kmaps) { struct rb_root modules = RB_ROOT; - struct map_rb_node *old_node; int err; err = read_proc_modules(filename, &modules); if (err) return err; - maps__for_each_entry(kmaps, old_node) { - struct map *old_map = old_node->map; - struct module_info *mi; - struct dso *dso; + err = maps__for_each_map(kmaps, do_validate_kcore_modules_cb, &modules); - if (!__map__is_kmodule(old_map)) { - continue; - } - dso = map__dso(old_map); - /* Module must be in memory at the same address */ - mi = find_module(dso->short_name, &modules); - if (!mi || mi->start != map__start(old_map)) { - err = -EINVAL; - goto out; - } - } -out: delete_modules(&modules); return err; } -- GitLab From 228493d0a83bc2aec7f4a25491621f9d3b6d7bf2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:43 -0800 Subject: [PATCH 0380/1290] perf synthetic-events: Use function to add missing maps lock Switch perf_event__synthesize_modules from loop macro maps__for_each_entry to maps__for_each_map function that takes a callback. The function holds the maps lock, which should be held during iteration. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/synthetic-events.c | 118 ++++++++++++++++------------- 1 file changed, 67 insertions(+), 51 deletions(-) diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index a0579c7d7b9e9..3712186353fb9 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -665,18 +665,74 @@ int perf_event__synthesize_cgroups(struct perf_tool *tool __maybe_unused, } #endif +struct perf_event__synthesize_modules_maps_cb_args { + struct perf_tool *tool; + perf_event__handler_t process; + struct machine *machine; + union perf_event *event; +}; + +static int perf_event__synthesize_modules_maps_cb(struct map *map, void *data) +{ + struct perf_event__synthesize_modules_maps_cb_args *args = data; + union perf_event *event = args->event; + struct dso *dso; + size_t size; + + if (!__map__is_kmodule(map)) + return 0; + + dso = map__dso(map); + if (symbol_conf.buildid_mmap2) { + size = PERF_ALIGN(dso->long_name_len + 1, sizeof(u64)); + event->mmap2.header.type = PERF_RECORD_MMAP2; + event->mmap2.header.size = (sizeof(event->mmap2) - + (sizeof(event->mmap2.filename) - size)); + memset(event->mmap2.filename + size, 0, args->machine->id_hdr_size); + event->mmap2.header.size += args->machine->id_hdr_size; + event->mmap2.start = map__start(map); + event->mmap2.len = map__size(map); + event->mmap2.pid = args->machine->pid; + + memcpy(event->mmap2.filename, dso->long_name, dso->long_name_len + 1); + + perf_record_mmap2__read_build_id(&event->mmap2, args->machine, false); + } else { + size = PERF_ALIGN(dso->long_name_len + 1, sizeof(u64)); + event->mmap.header.type = PERF_RECORD_MMAP; + event->mmap.header.size = (sizeof(event->mmap) - + (sizeof(event->mmap.filename) - size)); + memset(event->mmap.filename + size, 0, args->machine->id_hdr_size); + event->mmap.header.size += args->machine->id_hdr_size; + event->mmap.start = map__start(map); + event->mmap.len = map__size(map); + event->mmap.pid = args->machine->pid; + + memcpy(event->mmap.filename, dso->long_name, dso->long_name_len + 1); + } + + if (perf_tool__process_synth_event(args->tool, event, args->machine, args->process) != 0) + return -1; + + return 0; +} + int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine) { - int rc = 0; - struct map_rb_node *pos; + int rc; struct maps *maps = machine__kernel_maps(machine); - union perf_event *event; - size_t size = symbol_conf.buildid_mmap2 ? - sizeof(event->mmap2) : sizeof(event->mmap); + struct perf_event__synthesize_modules_maps_cb_args args = { + .tool = tool, + .process = process, + .machine = machine, + }; + size_t size = symbol_conf.buildid_mmap2 + ? sizeof(args.event->mmap2) + : sizeof(args.event->mmap); - event = zalloc(size + machine->id_hdr_size); - if (event == NULL) { + args.event = zalloc(size + machine->id_hdr_size); + if (args.event == NULL) { pr_debug("Not enough memory synthesizing mmap event " "for kernel modules\n"); return -1; @@ -687,53 +743,13 @@ int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t * __perf_event_mmap */ if (machine__is_host(machine)) - event->header.misc = PERF_RECORD_MISC_KERNEL; + args.event->header.misc = PERF_RECORD_MISC_KERNEL; else - event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL; - - maps__for_each_entry(maps, pos) { - struct map *map = pos->map; - struct dso *dso; + args.event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL; - if (!__map__is_kmodule(map)) - continue; + rc = maps__for_each_map(maps, perf_event__synthesize_modules_maps_cb, &args); - dso = map__dso(map); - if (symbol_conf.buildid_mmap2) { - size = PERF_ALIGN(dso->long_name_len + 1, sizeof(u64)); - event->mmap2.header.type = PERF_RECORD_MMAP2; - event->mmap2.header.size = (sizeof(event->mmap2) - - (sizeof(event->mmap2.filename) - size)); - memset(event->mmap2.filename + size, 0, machine->id_hdr_size); - event->mmap2.header.size += machine->id_hdr_size; - event->mmap2.start = map__start(map); - event->mmap2.len = map__size(map); - event->mmap2.pid = machine->pid; - - memcpy(event->mmap2.filename, dso->long_name, dso->long_name_len + 1); - - perf_record_mmap2__read_build_id(&event->mmap2, machine, false); - } else { - size = PERF_ALIGN(dso->long_name_len + 1, sizeof(u64)); - event->mmap.header.type = PERF_RECORD_MMAP; - event->mmap.header.size = (sizeof(event->mmap) - - (sizeof(event->mmap.filename) - size)); - memset(event->mmap.filename + size, 0, machine->id_hdr_size); - event->mmap.header.size += machine->id_hdr_size; - event->mmap.start = map__start(map); - event->mmap.len = map__size(map); - event->mmap.pid = machine->pid; - - memcpy(event->mmap.filename, dso->long_name, dso->long_name_len + 1); - } - - if (perf_tool__process_synth_event(tool, event, machine, process) != 0) { - rc = -1; - break; - } - } - - free(event); + free(args.event); return rc; } -- GitLab From 71225af17f611632226a4a2fae25235fd8dad268 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:44 -0800 Subject: [PATCH 0381/1290] perf thread: Use function to add missing maps lock Switch thread__prepare_access from loop macro maps__for_each_entry to maps__for_each_map function that takes a callback. The function holds the maps lock, which should be held during iteration. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index b9c2039c4230c..b6986a81aa6df 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -349,34 +349,33 @@ int thread__insert_map(struct thread *thread, struct map *map) return maps__insert(thread__maps(thread), map); } -static int __thread__prepare_access(struct thread *thread) +struct thread__prepare_access_maps_cb_args { + int err; + struct maps *maps; +}; + +static int thread__prepare_access_maps_cb(struct map *map, void *data) { bool initialized = false; - int err = 0; - struct maps *maps = thread__maps(thread); - struct map_rb_node *rb_node; - - down_read(maps__lock(maps)); - - maps__for_each_entry(maps, rb_node) { - err = unwind__prepare_access(thread__maps(thread), rb_node->map, &initialized); - if (err || initialized) - break; - } + struct thread__prepare_access_maps_cb_args *args = data; - up_read(maps__lock(maps)); + args->err = unwind__prepare_access(args->maps, map, &initialized); - return err; + return (args->err || initialized) ? 1 : 0; } static int thread__prepare_access(struct thread *thread) { - int err = 0; + struct thread__prepare_access_maps_cb_args args = { + .err = 0, + }; - if (dwarf_callchain_users) - err = __thread__prepare_access(thread); + if (dwarf_callchain_users) { + args.maps = thread__maps(thread); + maps__for_each_map(thread__maps(thread), thread__prepare_access_maps_cb, &args); + } - return err; + return args.err; } static int thread__clone_maps(struct thread *thread, struct thread *parent, bool do_maps_clone) -- GitLab From d23569979ca1cd139a42c410e0c7b9e6014c3b3a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 13 Dec 2023 09:37:01 -0500 Subject: [PATCH 0382/1290] tracing: Allow creating instances with specified system events A trace instance may only need to enable specific events. As the eventfs directory of an instance currently creates all events which adds overhead, allow internal instances to be created with just the events in systems that they care about. This currently only deals with systems and not individual events, but this should bring down the overhead of creating instances for specific use cases quite bit. The trace_array_get_by_name() now has another parameter "systems". This parameter is a const string pointer of a comma/space separated list of event systems that should be created by the trace_array. (Note if the trace_array already exists, this parameter is ignored). The list of systems is saved and if a module is loaded, its events will not be added unless the system for those events also match the systems string. Link: https://lore.kernel.org/linux-trace-kernel/20231213093701.03fddec0@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Sean Paul Cc: Arun Easi Cc: Daniel Wagner Tested-by: Dmytro Maluka Signed-off-by: Steven Rostedt (Google) --- drivers/scsi/qla2xxx/qla_os.c | 2 +- include/linux/trace.h | 4 +-- kernel/trace/trace.c | 23 +++++++++++--- kernel/trace/trace.h | 1 + kernel/trace/trace_boot.c | 2 +- kernel/trace/trace_events.c | 48 +++++++++++++++++++++++++++-- samples/ftrace/sample-trace-array.c | 2 +- 7 files changed, 70 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 03348f605c2e9..dd674378f2f39 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -2889,7 +2889,7 @@ static void qla2x00_iocb_work_fn(struct work_struct *work) static void qla_trace_init(void) { - qla_trc_array = trace_array_get_by_name("qla2xxx"); + qla_trc_array = trace_array_get_by_name("qla2xxx", NULL); if (!qla_trc_array) { ql_log(ql_log_fatal, NULL, 0x0001, "Unable to create qla2xxx trace instance, instance logging will be disabled.\n"); diff --git a/include/linux/trace.h b/include/linux/trace.h index 2a70a447184c9..fdcd76b7be83d 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -51,7 +51,7 @@ int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); int trace_array_init_printk(struct trace_array *tr); void trace_array_put(struct trace_array *tr); -struct trace_array *trace_array_get_by_name(const char *name); +struct trace_array *trace_array_get_by_name(const char *name, const char *systems); int trace_array_destroy(struct trace_array *tr); /* For osnoise tracer */ @@ -84,7 +84,7 @@ static inline int trace_array_init_printk(struct trace_array *tr) static inline void trace_array_put(struct trace_array *tr) { } -static inline struct trace_array *trace_array_get_by_name(const char *name) +static inline struct trace_array *trace_array_get_by_name(const char *name, const char *systems) { return NULL; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 199df497db07e..59e39b652afb5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9490,7 +9490,8 @@ static int trace_array_create_dir(struct trace_array *tr) return ret; } -static struct trace_array *trace_array_create(const char *name) +static struct trace_array * +trace_array_create_systems(const char *name, const char *systems) { struct trace_array *tr; int ret; @@ -9510,6 +9511,12 @@ static struct trace_array *trace_array_create(const char *name) if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) goto out_free_tr; + if (systems) { + tr->system_names = kstrdup_const(systems, GFP_KERNEL); + if (!tr->system_names) + goto out_free_tr; + } + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -9556,12 +9563,18 @@ static struct trace_array *trace_array_create(const char *name) free_trace_buffers(tr); free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr->system_names); kfree(tr->name); kfree(tr); return ERR_PTR(ret); } +static struct trace_array *trace_array_create(const char *name) +{ + return trace_array_create_systems(name, NULL); +} + static int instance_mkdir(const char *name) { struct trace_array *tr; @@ -9587,6 +9600,7 @@ out_unlock: /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. * @name: The name of the trace array to be looked up/created. + * @systems: A list of systems to create event directories for (NULL for all) * * Returns pointer to trace array with given name. * NULL, if it cannot be created. @@ -9600,7 +9614,7 @@ out_unlock: * trace_array_put() is called, user space can not delete it. * */ -struct trace_array *trace_array_get_by_name(const char *name) +struct trace_array *trace_array_get_by_name(const char *name, const char *systems) { struct trace_array *tr; @@ -9612,7 +9626,7 @@ struct trace_array *trace_array_get_by_name(const char *name) goto out_unlock; } - tr = trace_array_create(name); + tr = trace_array_create_systems(name, systems); if (IS_ERR(tr)) tr = NULL; @@ -9659,6 +9673,7 @@ static int __remove_instance(struct trace_array *tr) free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr->system_names); kfree(tr->name); kfree(tr); @@ -10377,7 +10392,7 @@ __init static void enable_instances(void) if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) do_allocate_snapshot(tok); - tr = trace_array_get_by_name(tok); + tr = trace_array_get_by_name(tok, NULL); if (!tr) { pr_warn("Failed to create instance buffer %s\n", curr_str); continue; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0489e72c8169c..79180aed13eee 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -377,6 +377,7 @@ struct trace_array { unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; + const char *system_names; struct list_head err_log; struct dentry *dir; struct dentry *options; diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7ccc7a8e155b9..dbe29b4c6a7a0 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -633,7 +633,7 @@ trace_boot_init_instances(struct xbc_node *node) if (!p || *p == '\0') continue; - tr = trace_array_get_by_name(p); + tr = trace_array_get_by_name(p, NULL); if (!tr) { pr_err("Failed to get trace instance %s\n", p); continue; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f29e815ca5b2e..b70d038180384 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2896,6 +2896,27 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) up_write(&trace_event_sem); } +static bool event_in_systems(struct trace_event_call *call, + const char *systems) +{ + const char *system; + const char *p; + + if (!systems) + return true; + + system = call->class->system; + p = strstr(systems, system); + if (!p) + return false; + + if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',') + return false; + + p += strlen(system); + return !*p || isspace(*p) || *p == ','; +} + static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -2905,9 +2926,12 @@ trace_create_new_event(struct trace_event_call *call, struct trace_event_file *file; unsigned int first; + if (!event_in_systems(call, tr->system_names)) + return NULL; + file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) - return NULL; + return ERR_PTR(-ENOMEM); pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); @@ -2972,8 +2996,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) struct trace_event_file *file; file = trace_create_new_event(call, tr); + /* + * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed + * allocation, or NULL if the event is not part of the tr->system_names. + * When the event is not part of the tr->system_names, return zero, not + * an error. + */ if (!file) - return -ENOMEM; + return 0; + + if (IS_ERR(file)) + return PTR_ERR(file); if (eventdir_initialized) return event_create_dir(tr->event_dir, file); @@ -3012,8 +3045,17 @@ __trace_early_add_new_event(struct trace_event_call *call, int ret; file = trace_create_new_event(call, tr); + /* + * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed + * allocation, or NULL if the event is not part of the tr->system_names. + * When the event is not part of the tr->system_names, return zero, not + * an error. + */ if (!file) - return -ENOMEM; + return 0; + + if (IS_ERR(file)) + return PTR_ERR(file); ret = event_define_fields(call); if (ret) diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c index 6aba02a31c96c..d0ee9001c7b37 100644 --- a/samples/ftrace/sample-trace-array.c +++ b/samples/ftrace/sample-trace-array.c @@ -105,7 +105,7 @@ static int __init sample_trace_array_init(void) * NOTE: This function increments the reference counter * associated with the trace array - "tr". */ - tr = trace_array_get_by_name("sample-instance"); + tr = trace_array_get_by_name("sample-instance", "sched,timer,kprobes"); if (!tr) return -1; -- GitLab From 0b9036efd83d4adefab197a1ec98c496c9df44f0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 11 Dec 2023 13:16:23 -0500 Subject: [PATCH 0383/1290] ring-buffer: Add offset of events in dump on mismatch On bugs that have the ring buffer timestamp get out of sync, the config CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS, that checks for it and if it is detected it causes a dump of the bad sub buffer. It shows each event and their timestamp as well as the delta in the event. But it's also good to see the offset into the subbuffer for that event to know if how close to the end it is. Also print where the last event actually ended compared to where it was expected to end. Link: https://lore.kernel.org/linux-trace-kernel/20231211131623.59eaebd2@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 83eab547f1d1f..bfe2697a92ee4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3358,29 +3358,34 @@ static void dump_buffer_page(struct buffer_data_page *bpage, case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; - pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); + pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n", + e, ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); ts = rb_fix_abs_ts(delta, ts); - pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); + pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n", + e, ts, delta); break; case RINGBUF_TYPE_PADDING: ts += event->time_delta; - pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d PADDING\n", + e, ts, event->time_delta); break; case RINGBUF_TYPE_DATA: ts += event->time_delta; - pr_warn(" [%lld] delta:%d\n", ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d\n", + e, ts, event->time_delta); break; default: break; } } + pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e); } static DEFINE_PER_CPU(atomic_t, checking); -- GitLab From 8ec90be7f15fac42992ea821be929d3b06cd0fd9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Dec 2023 13:19:01 -0500 Subject: [PATCH 0384/1290] tracing: Allow for max buffer data size trace_marker writes Allow a trace write to be as big as the ring buffer tracing data will allow. Currently, it only allows writes of 1KB in size, but there's no reason that it cannot allow what the ring buffer can hold. Link: https://lore.kernel.org/linux-trace-kernel/20231212131901.5f501e72@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 + kernel/trace/ring_buffer.c | 15 +++++++++++++++ kernel/trace/trace.c | 31 ++++++++++++++++++++++++------- 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 782e14f62201f..b1b03b2c0f08e 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -141,6 +141,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter); bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter); unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu); +unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer); void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bfe2697a92ee4..16b640d824f92 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5190,6 +5190,21 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_size); +/** + * ring_buffer_max_event_size - return the max data size of an event + * @buffer: The ring buffer. + * + * Returns the maximum size an event can be. + */ +unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer) +{ + /* If abs timestamp is requested, events have a timestamp too */ + if (ring_buffer_time_stamp_abs(buffer)) + return BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND; + return BUF_MAX_DATA_SIZE; +} +EXPORT_SYMBOL_GPL(ring_buffer_max_event_size); + static void rb_clear_buffer_page(struct buffer_page *page) { local_set(&page->write, 0); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 59e39b652afb5..dba1328e454b3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7278,8 +7278,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; struct print_entry *entry; + int meta_size; ssize_t written; - int size; + size_t size; int len; /* Used in tracing_mark_raw_write() as well */ @@ -7292,12 +7293,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; - if (cnt > TRACE_BUF_SIZE) - cnt = TRACE_BUF_SIZE; - - BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + if ((ssize_t)cnt < 0) + return -EINVAL; - size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ + meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ + again: + size = cnt + meta_size; /* If less than "", then make sure we can still add that */ if (cnt < FAULTED_SIZE) @@ -7306,9 +7307,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); - if (unlikely(!event)) + if (unlikely(!event)) { + /* + * If the size was greater than what was allowed, then + * make it smaller and try again. + */ + if (size > ring_buffer_max_event_size(buffer)) { + /* cnt < FAULTED size should never be bigger than max */ + if (WARN_ON_ONCE(cnt < FAULTED_SIZE)) + return -EBADF; + cnt = ring_buffer_max_event_size(buffer) - meta_size; + /* The above should only happen once */ + if (WARN_ON_ONCE(cnt + meta_size == size)) + return -EBADF; + goto again; + } + /* Ring buffer disabled, return as if not open for write */ return -EBADF; + } entry = ring_buffer_event_data(event); entry->ip = _THIS_IP_; -- GitLab From 40fc60e36c60ba85b2974e507b67df40c94e9578 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 9 Dec 2023 17:52:20 -0500 Subject: [PATCH 0385/1290] trace_seq: Increase the buffer size to almost two pages Now that trace_marker can hold more than 1KB string, and can write as much as the ring buffer can hold, the trace_seq is not big enough to hold writes: ~# a="1234567890" ~# cnt=4080 ~# s="" ~# while [ $cnt -gt 10 ]; do ~# s="${s}${a}" ~# cnt=$((cnt-10)) ~# done ~# echo $s > trace_marker ~# cat trace # tracer: nop # # entries-in-buffer/entries-written: 2/2 #P:8 # # _-----=> irqs-off/BH-disabled # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> migrate-disable # |||| / delay # TASK-PID CPU# ||||| TIMESTAMP FUNCTION # | | | ||||| | | <...>-860 [002] ..... 105.543465: tracing_mark_write[LINE TOO BIG] <...>-860 [002] ..... 105.543496: tracing_mark_write: 789012345678901234567890 By increasing the trace_seq buffer to almost two pages, it can now print out the first line. This also subtracts the rest of the trace_seq fields from the buffer, so that the entire trace_seq is now PAGE_SIZE aligned. Link: https://lore.kernel.org/linux-trace-kernel/20231209175220.19867af4@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_seq.h | 9 ++++++--- kernel/trace/trace.c | 6 +++--- kernel/trace/trace_seq.c | 3 --- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 3691e0e76a1a2..9ec229dfddaa7 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -8,11 +8,14 @@ /* * Trace sequences are used to allow a function to call several other functions - * to create a string of data to use (up to a max of PAGE_SIZE). + * to create a string of data to use. */ +#define TRACE_SEQ_BUFFER_SIZE (PAGE_SIZE * 2 - \ + (sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int))) + struct trace_seq { - char buffer[PAGE_SIZE]; + char buffer[TRACE_SEQ_BUFFER_SIZE]; struct seq_buf seq; size_t readpos; int full; @@ -21,7 +24,7 @@ struct trace_seq { static inline void trace_seq_init(struct trace_seq *s) { - seq_buf_init(&s->seq, s->buffer, PAGE_SIZE); + seq_buf_init(&s->seq, s->buffer, TRACE_SEQ_BUFFER_SIZE); s->full = 0; s->readpos = 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dba1328e454b3..0be30cccabb4b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3753,7 +3753,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, /* OK if part of the temp seq buffer */ if ((addr >= (unsigned long)iter->tmp_seq.buffer) && - (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE)) + (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE)) return true; /* Core rodata can not be freed */ @@ -6932,8 +6932,8 @@ waitagain: goto out; } - if (cnt >= PAGE_SIZE) - cnt = PAGE_SIZE - 1; + if (cnt >= TRACE_SEQ_BUFFER_SIZE) + cnt = TRACE_SEQ_BUFFER_SIZE - 1; /* reset all but tr, trace, and overruns */ trace_iterator_reset(iter); diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 7be97229ddf86..c158d65a8a886 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -13,9 +13,6 @@ * trace_seq_init() more than once to reset the trace_seq to start * from scratch. * - * The buffer size is currently PAGE_SIZE, although it may become dynamic - * in the future. - * * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into * the buffer but it wont update the pointers). This allows users to -- GitLab From 9482341d9bdaf194552673b6c1c81a824e985e7f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Dec 2023 19:04:22 -0500 Subject: [PATCH 0386/1290] tracing: Have trace_marker break up by lines by size of trace_seq If a trace_marker write is bigger than what trace_seq can hold, then it will print "LINE TOO BIG" message and not what was written. Instead, check if the write is bigger than the trace_seq and break it up by that size. Ideally, we could make the trace_seq dynamic that could hold this. But that's for another time. Link: https://lore.kernel.org/linux-trace-kernel/20231212190422.1eaf224f@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0be30cccabb4b..9cf58383d2fb3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7304,6 +7304,11 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt < FAULTED_SIZE) size += FAULTED_SIZE - cnt; + if (size > TRACE_SEQ_BUFFER_SIZE) { + cnt -= size - TRACE_SEQ_BUFFER_SIZE; + goto again; + } + buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); -- GitLab From 76ca20c748684f63bb985166850049f242797f65 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 13 Dec 2023 10:42:18 -0500 Subject: [PATCH 0387/1290] tracing: Increase size of trace_marker_raw to max ring buffer entry There's no reason to give an arbitrary limit to the size of a raw trace marker. Just let it be as big as the size that is allowed by the ring buffer itself. And there's also no reason to artificially break up the write to TRACE_BUF_SIZE, as that's not even used. Link: https://lore.kernel.org/linux-trace-kernel/20231213104218.2efc70c1@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9cf58383d2fb3..55dabee4c78b1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7365,9 +7365,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return written; } -/* Limit it for now to 3K (including tag) */ -#define RAW_DATA_MAX_SIZE (1024*3) - static ssize_t tracing_mark_raw_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) @@ -7389,19 +7386,18 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, return -EINVAL; /* The marker must at least have a tag id */ - if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE) + if (cnt < sizeof(unsigned int)) return -EINVAL; - if (cnt > TRACE_BUF_SIZE) - cnt = TRACE_BUF_SIZE; - - BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - size = sizeof(*entry) + cnt; if (cnt < FAULT_SIZE_ID) size += FAULT_SIZE_ID - cnt; buffer = tr->array_buffer.buffer; + + if (size > ring_buffer_max_event_size(buffer)) + return -EINVAL; + event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, tracing_gen_ctx()); if (!event) -- GitLab From 3bf7009251f0f41cdd0188ab7b3879df81810703 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 13 Dec 2023 11:15:27 -0500 Subject: [PATCH 0388/1290] tracing/selftests: Add test to test the trace_marker Add a test that writes longs strings, some over the size of the sub buffer and make sure that the entire content is there. Link: https://lore.kernel.org/linux-trace-kernel/20231213111527.6a4c9b58@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Acked-by: Shuah Khan Signed-off-by: Steven Rostedt (Google) --- .../ftrace/test.d/00basic/trace_marker.tc | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100755 tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc diff --git a/tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc new file mode 100755 index 0000000000000..9aa0db2b84fc5 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc @@ -0,0 +1,82 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Basic tests on writing to trace_marker +# requires: trace_marker +# flags: instance + +get_buffer_data_size() { + sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page +} + +get_buffer_data_offset() { + sed -ne 's/^.*data.*offset:\([0-9][0-9]*\).*/\1/p' events/header_page +} + +get_event_header_size() { + type_len=`sed -ne 's/^.*type_len.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event` + time_len=`sed -ne 's/^.*time_delta.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event` + array_len=`sed -ne 's/^.*array.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event` + total_bits=$((type_len+time_len+array_len)) + total_bits=$((total_bits+7)) + echo $((total_bits/8)) +} + +get_print_event_buf_offset() { + sed -ne 's/^.*buf.*offset:\([0-9][0-9]*\).*/\1/p' events/ftrace/print/format +} + +event_header_size=`get_event_header_size` +print_header_size=`get_print_event_buf_offset` + +data_offset=`get_buffer_data_offset` + +marker_meta=$((event_header_size+print_header_size)) + +make_str() { + cnt=$1 + # subtract two for \n\0 as marker adds these + cnt=$((cnt-2)) + printf -- 'X%.0s' $(seq $cnt) +} + +write_buffer() { + size=$1 + + str=`make_str $size` + + # clear the buffer + echo > trace + + # write the string into the marker + echo -n $str > trace_marker + + echo $str +} + +test_buffer() { + + size=`get_buffer_data_size` + oneline_size=$((size-marker_meta)) + echo size = $size + echo meta size = $marker_meta + + # Now add a little more the meta data overhead will overflow + + str=`write_buffer $size` + + # Make sure the line was broken + new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; exit}' trace` + + if [ "$new_str" = "$str" ]; then + exit fail; + fi + + # Make sure the entire line can be found + new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; }' trace` + + if [ "$new_str" != "$str" ]; then + exit fail; + fi +} + +test_buffer -- GitLab From c84897c0ff5925090af0c6a8bf61424b71481764 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 07:43:03 -0500 Subject: [PATCH 0389/1290] ring-buffer: Remove 32bit timestamp logic Each event has a 27 bit timestamp delta that is used to hold the delta from the last event. If the time between events is greater than 2^27, then a timestamp is added that holds a 59 bit absolute timestamp. Until a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp"), if an interrupt interrupted an event in progress, all the events delta would be zero to not deal with the races that need to be handled. The commit a389d86f7fd09 changed that to handle the races giving all events, even those that preempt other events, still have an accurate timestamp. To handle those races requires performing 64-bit cmpxchg on the timestamps. But doing 64-bit cmpxchg on 32-bit architectures is considered very slow. To try to deal with this the timestamp logic was broken into two and then three 32-bit cmpxchgs, with the thought that two (or three) 32-bit cmpxchgs are still faster than a single 64-bit cmpxchg on 32-bit architectures. Part of the problem with this is that I didn't have any 32-bit architectures to test on. After hitting several subtle bugs in this code, an effort was made to try and see if three 32-bit cmpxchgs are indeed faster than a single 64-bit. After a few people brushed off the dust of their old 32-bit machines, tests were done, and even though 32-bit cmpxchg was faster than a single 64-bit, it was in the order of 50% at best, not 300%. After some more refactoring of the code, all 4 64-bit cmpxchg were removed: https://lore.kernel.org/linux-trace-kernel/20231211114420.36dde01b@gandalf.local.home https://lore.kernel.org/linux-trace-kernel/20231214222921.193037a7@gandalf.local.home https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorschach.local.home https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home/ With all the 64-bit cmpxchg removed, the complex 32-bit workaround can also be removed. The 32-bit and 64-bit logic is now exactly the same. Link: https://lore.kernel.org/all/20231213214632.15047c40@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231219074303.28f9abda@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Linus Torvalds Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 176 ++----------------------------------- 1 file changed, 9 insertions(+), 167 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 16b640d824f92..a4ada4f303c4b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -27,6 +27,7 @@ #include #include +#include #include /* @@ -463,27 +464,9 @@ enum { RB_CTX_MAX }; -#if BITS_PER_LONG == 32 -#define RB_TIME_32 -#endif - -/* To test on 64 bit machines */ -//#define RB_TIME_32 - -#ifdef RB_TIME_32 - -struct rb_time_struct { - local_t cnt; - local_t top; - local_t bottom; - local_t msb; -}; -#else -#include struct rb_time_struct { local64_t time; }; -#endif typedef struct rb_time_struct rb_time_t; #define MAX_NEST 5 @@ -573,147 +556,14 @@ struct ring_buffer_iter { int missed_events; }; -#ifdef RB_TIME_32 - -/* - * On 32 bit machines, local64_t is very expensive. As the ring - * buffer doesn't need all the features of a true 64 bit atomic, - * on 32 bit, it uses these functions (64 still uses local64_t). - * - * For the ring buffer, 64 bit required operations for the time is - * the following: - * - * - Reads may fail if it interrupted a modification of the time stamp. - * It will succeed if it did not interrupt another write even if - * the read itself is interrupted by a write. - * It returns whether it was successful or not. - * - * - Writes always succeed and will overwrite other writes and writes - * that were done by events interrupting the current write. - * - * - A write followed by a read of the same time stamp will always succeed, - * but may not contain the same value. - * - * - A cmpxchg will fail if it interrupted another write or cmpxchg. - * Other than that, it acts like a normal cmpxchg. - * - * The 60 bit time stamp is broken up by 30 bits in a top and bottom half - * (bottom being the least significant 30 bits of the 60 bit time stamp). - * - * The two most significant bits of each half holds a 2 bit counter (0-3). - * Each update will increment this counter by one. - * When reading the top and bottom, if the two counter bits match then the - * top and bottom together make a valid 60 bit number. - */ -#define RB_TIME_SHIFT 30 -#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) -#define RB_TIME_MSB_SHIFT 60 - -static inline int rb_time_cnt(unsigned long val) -{ - return (val >> RB_TIME_SHIFT) & 3; -} - -static inline u64 rb_time_val(unsigned long top, unsigned long bottom) -{ - u64 val; - - val = top & RB_TIME_VAL_MASK; - val <<= RB_TIME_SHIFT; - val |= bottom & RB_TIME_VAL_MASK; - - return val; -} - -static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) -{ - unsigned long top, bottom, msb; - unsigned long c; - - /* - * If the read is interrupted by a write, then the cnt will - * be different. Loop until both top and bottom have been read - * without interruption. - */ - do { - c = local_read(&t->cnt); - top = local_read(&t->top); - bottom = local_read(&t->bottom); - msb = local_read(&t->msb); - } while (c != local_read(&t->cnt)); - - *cnt = rb_time_cnt(top); - - /* If top, msb or bottom counts don't match, this interrupted a write */ - if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom)) - return false; - - /* The shift to msb will lose its cnt bits */ - *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT); - return true; -} - -static bool rb_time_read(rb_time_t *t, u64 *ret) -{ - unsigned long cnt; - - return __rb_time_read(t, ret, &cnt); -} - -static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt) -{ - return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); -} - -static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom, - unsigned long *msb) -{ - *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); - *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); - *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT); -} - -static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) -{ - val = rb_time_val_cnt(val, cnt); - local_set(t, val); -} - -static void rb_time_set(rb_time_t *t, u64 val) -{ - unsigned long cnt, top, bottom, msb; - - rb_time_split(val, &top, &bottom, &msb); - - /* Writes always succeed with a valid number even if it gets interrupted. */ - do { - cnt = local_inc_return(&t->cnt); - rb_time_val_set(&t->top, top, cnt); - rb_time_val_set(&t->bottom, bottom, cnt); - rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt); - } while (cnt != local_read(&t->cnt)); -} - -static inline bool -rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) -{ - return local_try_cmpxchg(l, &expect, set); -} - -#else /* 64 bits */ - -/* local64_t always succeeds */ - -static inline bool rb_time_read(rb_time_t *t, u64 *ret) +static inline void rb_time_read(rb_time_t *t, u64 *ret) { *ret = local64_read(&t->time); - return true; } static void rb_time_set(rb_time_t *t, u64 val) { local64_set(&t->time, val); } -#endif /* * Enable this to make sure that the event passed to @@ -820,10 +670,7 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, WARN_ONCE(1, "nest (%d) greater than max", nest); fail: - /* Can only fail on 32 bit */ - if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) - /* Screw it, just read the current time */ - ts = rb_time_stamp(cpu_buffer->buffer); + rb_time_read(&cpu_buffer->write_stamp, &ts); return ts; } @@ -2820,7 +2667,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, (unsigned long long)info->ts, (unsigned long long)info->before, (unsigned long long)info->after, - (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), + (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" @@ -3497,16 +3344,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write, w; - bool a_ok; - bool b_ok; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); - b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + rb_time_read(&cpu_buffer->before_stamp, &info->before); + rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); @@ -3521,7 +3366,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (!w) { /* Use the sub-buffer timestamp */ info->delta = 0; - } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) { + } else if (unlikely(info->before != info->after)) { info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } else { @@ -3570,8 +3415,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* SLOW PATH - Interrupted between A and C */ /* Save the old before_stamp */ - a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - RB_WARN_ON(cpu_buffer, !a_ok); + rb_time_read(&cpu_buffer->before_stamp, &info->before); /* * Read a new timestamp and update the before_stamp to make @@ -3583,9 +3427,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, rb_time_set(&cpu_buffer->before_stamp, ts); barrier(); - /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - /* Was interrupted before here, write_stamp must be valid */ - RB_WARN_ON(cpu_buffer, !a_ok); + /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && info->after == info->before && info->after < ts) { -- GitLab From d40dbb617ae9a8fe57343b26b4ad2bd7bb05c130 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 07:45:42 -0500 Subject: [PATCH 0390/1290] ring-buffer: Add interrupt information to dump of data sub-buffer When the ring buffer timestamp verifier triggers, it dumps the content of the sub-buffer. But currently it only dumps the timestamps and the offset of the data as well as the deltas. It would be even more informative if the event data also showed the interrupt context level it was in. That is, if each event showed that the event was written in normal, softirq, irq or NMI context. Then a better idea about how the events may have been interrupted from each other. As the payload of the ring buffer is really a black box of the ring buffer, just assume that if the payload is larger than a trace entry, that it is a trace entry. As trace entries have the interrupt context information saved in a flags field, look at that location and report the output of the flags. If the payload is not a trace entry, there's no way to really know, and the information will be garbage. But that's OK, because this is for debugging only (this output is not used in production as the buffer check that calls it causes a huge overhead to the tracing). This information, when available, is crucial for debugging timestamp issues. If it's garbage, it will also be pretty obvious that its garbage too. As this output usually happens in kselftests of the tracing code, the user will know what the payload is at the time. Link: https://lore.kernel.org/linux-trace-kernel/20231219074542.6f304601@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Suggested-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 79 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a4ada4f303c4b..c0cc45482e1e7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3185,6 +3185,76 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); #define CHECK_FULL_PAGE 1L #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS + +static const char *show_irq_str(int bits) +{ + const char *type[] = { + ".", // 0 + "s", // 1 + "h", // 2 + "Hs", // 3 + "n", // 4 + "Ns", // 5 + "Nh", // 6 + "NHs", // 7 + }; + + return type[bits]; +} + +/* Assume this is an trace event */ +static const char *show_flags(struct ring_buffer_event *event) +{ + struct trace_entry *entry; + int bits = 0; + + if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) + return "X"; + + entry = ring_buffer_event_data(event); + + if (entry->flags & TRACE_FLAG_SOFTIRQ) + bits |= 1; + + if (entry->flags & TRACE_FLAG_HARDIRQ) + bits |= 2; + + if (entry->flags & TRACE_FLAG_NMI) + bits |= 4; + + return show_irq_str(bits); +} + +static const char *show_irq(struct ring_buffer_event *event) +{ + struct trace_entry *entry; + + if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) + return ""; + + entry = ring_buffer_event_data(event); + if (entry->flags & TRACE_FLAG_IRQS_OFF) + return "d"; + return ""; +} + +static const char *show_interrupt_level(void) +{ + unsigned long pc = preempt_count(); + unsigned char level = 0; + + if (pc & SOFTIRQ_OFFSET) + level |= 1; + + if (pc & HARDIRQ_MASK) + level |= 2; + + if (pc & NMI_MASK) + level |= 4; + + return show_irq_str(level); +} + static void dump_buffer_page(struct buffer_data_page *bpage, struct rb_event_info *info, unsigned long tail) @@ -3224,8 +3294,9 @@ static void dump_buffer_page(struct buffer_data_page *bpage, case RINGBUF_TYPE_DATA: ts += event->time_delta; - pr_warn(" 0x%x: [%lld] delta:%d\n", - e, ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d %s%s\n", + e, ts, event->time_delta, + show_flags(event), show_irq(event)); break; default: @@ -3316,11 +3387,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, atomic_inc(&cpu_buffer->record_disabled); /* There's some cases in boot up that this can happen */ WARN_ON_ONCE(system_state != SYSTEM_BOOTING); - pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n", + pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", cpu_buffer->cpu, ts + info->delta, info->ts, info->delta, info->before, info->after, - full ? " (full)" : ""); + full ? " (full)" : "", show_interrupt_level()); dump_buffer_page(bpage, info, tail); atomic_dec(&ts_dump); /* Do not re-enable checking */ -- GitLab From f50345b49b16f7fe6dbebbec065b9248c38556ff Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 07:47:32 -0500 Subject: [PATCH 0391/1290] ring-buffer: Check if absolute timestamp goes backwards The check_buffer() which checks the timestamps of the ring buffer sub-buffer page, when enabled, only checks if the adding of deltas of the events from the last absolute timestamp or the timestamp of the sub-buffer page adds up to the current event. What it does not check is if the absolute timestamp causes the time of the events to go backwards, as that can cause issues elsewhere. Test for the timestamp going backwards too. This also fixes a slight issue where if the warning triggers at boot up (because of the resetting of the tsc), it will disable all further checks, even those that are after boot Have it continue checking if the warning was ignored during boot up. Link: https://lore.kernel.org/linux-trace-kernel/20231219074732.18b092d4@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 46 +++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c0cc45482e1e7..f7dc74e45ebfa 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3309,6 +3309,23 @@ static void dump_buffer_page(struct buffer_data_page *bpage, static DEFINE_PER_CPU(atomic_t, checking); static atomic_t ts_dump; +#define buffer_warn_return(fmt, ...) \ + do { \ + /* If another report is happening, ignore this one */ \ + if (atomic_inc_return(&ts_dump) != 1) { \ + atomic_dec(&ts_dump); \ + goto out; \ + } \ + atomic_inc(&cpu_buffer->record_disabled); \ + pr_warn(fmt, ##__VA_ARGS__); \ + dump_buffer_page(bpage, info, tail); \ + atomic_dec(&ts_dump); \ + /* There's some cases in boot up that this can happen */ \ + if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \ + /* Do not re-enable checking */ \ + return; \ + } while (0) + /* * Check if the current event time stamp matches the deltas on * the buffer page. @@ -3362,7 +3379,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = rb_fix_abs_ts(delta, ts); + delta = rb_fix_abs_ts(delta, ts); + if (delta < ts) { + buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", + cpu_buffer->cpu, ts, delta); + } + ts = delta; break; case RINGBUF_TYPE_PADDING: @@ -3379,23 +3401,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { - /* If another report is happening, ignore this one */ - if (atomic_inc_return(&ts_dump) != 1) { - atomic_dec(&ts_dump); - goto out; - } - atomic_inc(&cpu_buffer->record_disabled); - /* There's some cases in boot up that this can happen */ - WARN_ON_ONCE(system_state != SYSTEM_BOOTING); - pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", - cpu_buffer->cpu, - ts + info->delta, info->ts, info->delta, - info->before, info->after, - full ? " (full)" : "", show_interrupt_level()); - dump_buffer_page(bpage, info, tail); - atomic_dec(&ts_dump); - /* Do not re-enable checking */ - return; + buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", + cpu_buffer->cpu, + ts + info->delta, info->ts, info->delta, + info->before, info->after, + full ? " (full)" : "", show_interrupt_level()); } out: atomic_dec(this_cpu_ptr(&checking)); -- GitLab From 1903ef8f0d77680e6eb36bd0cc3ea3aa6e95a050 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Dec 2023 15:34:42 +0800 Subject: [PATCH 0392/1290] iommu/vt-d: Refactor device_to_iommu() to retrieve iommu directly The device_to_iommu() helper was originally designed to look up the DMAR ACPI table to retrieve the iommu device and the request ID for a given device. However, it was also being used in other places where there was no need to lookup the ACPI table at all. Retrieve the iommu device directly from the per-device iommu private data in functions called after device is probed. Rename the original device_to_iommu() function to a more meaningful name, device_lookup_iommu(), to avoid mis-using it. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20231116015048.29675-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 30 ++++++++++-------------------- drivers/iommu/intel/iommu.h | 1 - drivers/iommu/intel/svm.c | 20 +++----------------- 3 files changed, 13 insertions(+), 38 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3531b956556c7..3b25b36bb9c73 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -703,7 +703,7 @@ static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev) return false; } -struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) +static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn) { struct dmar_drhd_unit *drhd = NULL; struct pci_dev *pdev = NULL; @@ -2081,14 +2081,11 @@ static int domain_context_mapping_cb(struct pci_dev *pdev, static int domain_context_mapping(struct dmar_domain *domain, struct device *dev) { + struct device_domain_info *info = dev_iommu_priv_get(dev); struct domain_context_mapping_data data; + struct intel_iommu *iommu = info->iommu; + u8 bus = info->bus, devfn = info->devfn; struct pasid_table *table; - struct intel_iommu *iommu; - u8 bus, devfn; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return -ENODEV; table = intel_pasid_get_table(dev); @@ -2447,15 +2444,10 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu; + struct intel_iommu *iommu = info->iommu; unsigned long flags; - u8 bus, devfn; int ret; - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return -ENODEV; - ret = domain_attach_iommu(domain, iommu); if (ret) return ret; @@ -4116,14 +4108,11 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) int prepare_domain_attach_device(struct iommu_domain *domain, struct device *dev) { + struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); - struct intel_iommu *iommu; + struct intel_iommu *iommu = info->iommu; int addr_width; - iommu = device_to_iommu(dev, NULL, NULL); - if (!iommu) - return -ENODEV; - if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) return -EINVAL; @@ -4399,7 +4388,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) u8 bus, devfn; int ret; - iommu = device_to_iommu(dev, &bus, &devfn); + iommu = device_lookup_iommu(dev, &bus, &devfn); if (!iommu || !iommu->iommu.ops) return ERR_PTR(-ENODEV); @@ -4735,8 +4724,9 @@ static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) { - struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct device_domain_info *info = dev_iommu_priv_get(dev); struct dev_pasid_info *curr, *dev_pasid = NULL; + struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; struct iommu_domain *domain; unsigned long flags; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 65d37a138c75d..f8f956ce43c52 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -897,7 +897,6 @@ int dmar_ir_support(void); void *alloc_pgtable_page(int node, gfp_t gfp); void free_pgtable_page(void *vaddr); void iommu_flush_write_buffer(struct intel_iommu *iommu); -struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, const struct iommu_user_data *user_data); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 50a481c895b86..cc138e3ed4a6b 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -366,14 +366,9 @@ free_svm: void intel_svm_remove_dev_pasid(struct device *dev, u32 pasid) { struct intel_svm_dev *sdev; - struct intel_iommu *iommu; struct intel_svm *svm; struct mm_struct *mm; - iommu = device_to_iommu(dev, NULL, NULL); - if (!iommu) - return; - if (pasid_to_svm_sdev(dev, pasid, &svm, &sdev)) return; mm = svm->mm; @@ -724,25 +719,16 @@ int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg) { + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + u8 bus = info->bus, devfn = info->devfn; struct iommu_fault_page_request *prm; - struct intel_iommu *iommu; bool private_present; bool pasid_present; bool last_page; - u8 bus, devfn; int ret = 0; u16 sid; - if (!dev || !dev_is_pci(dev)) - return -ENODEV; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return -ENODEV; - - if (!msg || !evt) - return -EINVAL; - prm = &evt->fault.prm; sid = PCI_DEVID(bus, devfn); pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; -- GitLab From 47642bdd5a25cd46228221719af2902b583bd36c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Dec 2023 15:34:43 +0800 Subject: [PATCH 0393/1290] iommu/vt-d: Remove unused parameter of intel_pasid_setup_pass_through() The domain parameter of this helper is unused and can be deleted to avoid dead code. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20231116015048.29675-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 5 ++--- drivers/iommu/intel/pasid.c | 1 - drivers/iommu/intel/pasid.h | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3b25b36bb9c73..cf4d13248bf28 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2460,7 +2460,7 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) { /* Setup the PASID entry for requests without PASID: */ if (hw_pass_through && domain_type_is_si(domain)) - ret = intel_pasid_setup_pass_through(iommu, domain, + ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); else if (domain->use_first_level) ret = domain_setup_first_level(iommu, domain, dev, @@ -4797,8 +4797,7 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, goto out_free; if (domain_type_is_si(dmar_domain)) - ret = intel_pasid_setup_pass_through(iommu, dmar_domain, - dev, pasid); + ret = intel_pasid_setup_pass_through(iommu, dev, pasid); else if (dmar_domain->use_first_level) ret = domain_setup_first_level(iommu, dmar_domain, dev, pasid); diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 74e8e4c17e814..8a1bcabf71a93 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -767,7 +767,6 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, * Set up the scalable mode pasid entry for passthrough translation type. */ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, - struct dmar_domain *domain, struct device *dev, u32 pasid) { u16 did = FLPT_DEFAULT_DID; diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index dd37611175cc1..16265bc1f7ec6 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -111,7 +111,6 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool enabled); int intel_pasid_setup_pass_through(struct intel_iommu *iommu, - struct dmar_domain *domain, struct device *dev, u32 pasid); int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain); -- GitLab From d2b66903464ec29e54ac0991fa43c0da2c2a4892 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Dec 2023 15:34:44 +0800 Subject: [PATCH 0394/1290] iommu/vt-d: Remove unused vcmd interfaces Commit 99b5726b4423 ("iommu: Remove ioasid infrastructure") has removed ioasid allocation interfaces from the iommu subsystem. As a result, these vcmd interfaces have become obsolete. Remove them to avoid dead code. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20231116015048.29675-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/debugfs.c | 3 -- drivers/iommu/intel/iommu.h | 3 -- drivers/iommu/intel/pasid.c | 57 ----------------------------------- drivers/iommu/intel/pasid.h | 12 -------- 4 files changed, 75 deletions(-) diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index dee61e513be6d..86b506af7daa1 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -106,9 +106,6 @@ static const struct iommu_regset iommu_regs_64[] = { IOMMU_REGSET_ENTRY(MTRR_PHYSMASK8), IOMMU_REGSET_ENTRY(MTRR_PHYSBASE9), IOMMU_REGSET_ENTRY(MTRR_PHYSMASK9), - IOMMU_REGSET_ENTRY(VCCAP), - IOMMU_REGSET_ENTRY(VCMD), - IOMMU_REGSET_ENTRY(VCRSP), }; static struct dentry *intel_iommu_debug; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index f8f956ce43c52..37ddbf03d0a40 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -140,9 +140,6 @@ #define DMAR_ECEO_REG 0x408 #define DMAR_ECRSP_REG 0x410 #define DMAR_ECCAP_REG 0x430 -#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */ -#define DMAR_VCMD_REG 0xe00 /* Virtual command register */ -#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */ #define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) #define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 8a1bcabf71a93..57ae716a2c70a 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -26,63 +26,6 @@ */ u32 intel_pasid_max_id = PASID_MAX; -int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid) -{ - unsigned long flags; - u8 status_code; - int ret = 0; - u64 res; - - raw_spin_lock_irqsave(&iommu->register_lock, flags); - dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC); - IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, - !(res & VCMD_VRSP_IP), res); - raw_spin_unlock_irqrestore(&iommu->register_lock, flags); - - status_code = VCMD_VRSP_SC(res); - switch (status_code) { - case VCMD_VRSP_SC_SUCCESS: - *pasid = VCMD_VRSP_RESULT_PASID(res); - break; - case VCMD_VRSP_SC_NO_PASID_AVAIL: - pr_info("IOMMU: %s: No PASID available\n", iommu->name); - ret = -ENOSPC; - break; - default: - ret = -ENODEV; - pr_warn("IOMMU: %s: Unexpected error code %d\n", - iommu->name, status_code); - } - - return ret; -} - -void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid) -{ - unsigned long flags; - u8 status_code; - u64 res; - - raw_spin_lock_irqsave(&iommu->register_lock, flags); - dmar_writeq(iommu->reg + DMAR_VCMD_REG, - VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE); - IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, - !(res & VCMD_VRSP_IP), res); - raw_spin_unlock_irqrestore(&iommu->register_lock, flags); - - status_code = VCMD_VRSP_SC(res); - switch (status_code) { - case VCMD_VRSP_SC_SUCCESS: - break; - case VCMD_VRSP_SC_INVALID_PASID: - pr_info("IOMMU: %s: Invalid PASID\n", iommu->name); - break; - default: - pr_warn("IOMMU: %s: Unexpected error code %d\n", - iommu->name, status_code); - } -} - /* * Per device pasid table management: */ diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 16265bc1f7ec6..00401cfc2a406 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -22,16 +22,6 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) -/* Virtual command interface for enlightened pasid management. */ -#define VCMD_CMD_ALLOC 0x1 -#define VCMD_CMD_FREE 0x2 -#define VCMD_VRSP_IP 0x1 -#define VCMD_VRSP_SC(e) (((e) & 0xff) >> 1) -#define VCMD_VRSP_SC_SUCCESS 0 -#define VCMD_VRSP_SC_NO_PASID_AVAIL 16 -#define VCMD_VRSP_SC_INVALID_PASID 16 -#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 16) & 0xfffff) -#define VCMD_CMD_OPERAND(e) ((e) << 16) /* * Domain ID reserved for pasid entries programmed for first-level * only and pass-through transfer modes. @@ -117,8 +107,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore); -int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid); -void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid); void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, struct device *dev, u32 pasid); #endif /* __INTEL_PASID_H */ -- GitLab From 80b79e141da7251d6ea82573f058ee9418896465 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Dec 2023 15:34:45 +0800 Subject: [PATCH 0395/1290] iommu/vt-d: Move inline helpers to header files Move inline helpers to header files so that other files can use them without duplicating the code. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20231116015048.29675-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 204 ++-------------------------------- drivers/iommu/intel/iommu.h | 175 +++++++++++++++++++++++++++++ drivers/iommu/intel/pasid.c | 216 +----------------------------------- drivers/iommu/intel/pasid.h | 210 +++++++++++++++++++++++++++++++++++ 4 files changed, 400 insertions(+), 405 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index cf4d13248bf28..dc23a1a49c80f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -46,9 +46,6 @@ #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 -#define MAX_AGAW_WIDTH 64 -#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) - #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) @@ -63,74 +60,6 @@ #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) -/* page table handling */ -#define LEVEL_STRIDE (9) -#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) - -static inline int agaw_to_level(int agaw) -{ - return agaw + 2; -} - -static inline int agaw_to_width(int agaw) -{ - return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); -} - -static inline int width_to_agaw(int width) -{ - return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); -} - -static inline unsigned int level_to_offset_bits(int level) -{ - return (level - 1) * LEVEL_STRIDE; -} - -static inline int pfn_level_offset(u64 pfn, int level) -{ - return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; -} - -static inline u64 level_mask(int level) -{ - return -1ULL << level_to_offset_bits(level); -} - -static inline u64 level_size(int level) -{ - return 1ULL << level_to_offset_bits(level); -} - -static inline u64 align_to_level(u64 pfn, int level) -{ - return (pfn + level_size(level) - 1) & level_mask(level); -} - -static inline unsigned long lvl_to_nr_pages(unsigned int lvl) -{ - return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); -} - -/* VT-d pages must always be _smaller_ than MM pages. Otherwise things - are never going to work. */ -static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn) -{ - return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); -} -static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn) -{ - return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1; -} -static inline unsigned long page_to_dma_pfn(struct page *pg) -{ - return mm_to_dma_pfn_start(page_to_pfn(pg)); -} -static inline unsigned long virt_to_dma_pfn(void *p) -{ - return page_to_dma_pfn(virt_to_page(p)); -} - static void __init check_tylersburg_isoch(void); static int rwbf_quirk; @@ -168,78 +97,6 @@ static phys_addr_t root_entry_uctp(struct root_entry *re) return re->hi & VTD_PAGE_MASK; } -static inline void context_set_present(struct context_entry *context) -{ - context->lo |= 1; -} - -static inline void context_set_fault_enable(struct context_entry *context) -{ - context->lo &= (((u64)-1) << 2) | 1; -} - -static inline void context_set_translation_type(struct context_entry *context, - unsigned long value) -{ - context->lo &= (((u64)-1) << 4) | 3; - context->lo |= (value & 3) << 2; -} - -static inline void context_set_address_root(struct context_entry *context, - unsigned long value) -{ - context->lo &= ~VTD_PAGE_MASK; - context->lo |= value & VTD_PAGE_MASK; -} - -static inline void context_set_address_width(struct context_entry *context, - unsigned long value) -{ - context->hi |= value & 7; -} - -static inline void context_set_domain_id(struct context_entry *context, - unsigned long value) -{ - context->hi |= (value & ((1 << 16) - 1)) << 8; -} - -static inline void context_set_pasid(struct context_entry *context) -{ - context->lo |= CONTEXT_PASIDE; -} - -static inline int context_domain_id(struct context_entry *c) -{ - return((c->hi >> 8) & 0xffff); -} - -static inline void context_clear_entry(struct context_entry *context) -{ - context->lo = 0; - context->hi = 0; -} - -static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) -{ - if (!iommu->copied_tables) - return false; - - return test_bit(((long)bus << 8) | devfn, iommu->copied_tables); -} - -static inline void -set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) -{ - set_bit(((long)bus << 8) | devfn, iommu->copied_tables); -} - -static inline void -clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) -{ - clear_bit(((long)bus << 8) | devfn, iommu->copied_tables); -} - /* * This domain is a statically identity mapping domain. * 1. This domain creats a static 1:1 mapping to all usable memory. @@ -383,13 +240,12 @@ void free_pgtable_page(void *vaddr) free_page((unsigned long)vaddr); } -static inline int domain_type_is_si(struct dmar_domain *domain) +static int domain_type_is_si(struct dmar_domain *domain) { return domain->domain.type == IOMMU_DOMAIN_IDENTITY; } -static inline int domain_pfn_supported(struct dmar_domain *domain, - unsigned long pfn) +static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) { int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; @@ -451,7 +307,7 @@ int iommu_calculate_agaw(struct intel_iommu *iommu) return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); } -static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu) +static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) { return sm_supported(iommu) ? ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); @@ -1574,9 +1430,8 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, } /* Notification for newly created mappings */ -static inline void __mapping_notify_one(struct intel_iommu *iommu, - struct dmar_domain *domain, - unsigned long pfn, unsigned int pages) +static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain, + unsigned long pfn, unsigned int pages) { /* * It's a non-present to present mapping. Only flush if caching mode @@ -1843,7 +1698,7 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) spin_unlock(&iommu->lock); } -static inline int guestwidth_to_adjustwidth(int gaw) +static int guestwidth_to_adjustwidth(int gaw) { int agaw; int r = (gaw - 12) % 9; @@ -1877,7 +1732,7 @@ static void domain_exit(struct dmar_domain *domain) * Value of X in the PDTS field of a scalable mode context entry * indicates PASID directory with 2^(X + 7) entries. */ -static inline unsigned long context_get_sm_pds(struct pasid_table *table) +static unsigned long context_get_sm_pds(struct pasid_table *table) { unsigned long pds, max_pde; @@ -1889,38 +1744,6 @@ static inline unsigned long context_get_sm_pds(struct pasid_table *table) return pds - 7; } -/* - * Set the RID_PASID field of a scalable mode context entry. The - * IOMMU hardware will use the PASID value set in this field for - * DMA translations of DMA requests without PASID. - */ -static inline void -context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) -{ - context->hi |= pasid & ((1 << 20) - 1); -} - -/* - * Set the DTE(Device-TLB Enable) field of a scalable mode context - * entry. - */ -static inline void context_set_sm_dte(struct context_entry *context) -{ - context->lo |= BIT_ULL(2); -} - -/* - * Set the PRE(Page Request Enable) field of a scalable mode context - * entry. - */ -static inline void context_set_sm_pre(struct context_entry *context) -{ - context->lo |= BIT_ULL(4); -} - -/* Convert value to context PASID directory size field coding. */ -#define context_pdts(pds) (((pds) & 0x7) << 9) - static int domain_context_mapping_one(struct dmar_domain *domain, struct intel_iommu *iommu, struct pasid_table *table, @@ -2102,18 +1925,15 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev) } /* Returns a number of VTD pages, but aligned to MM page size */ -static inline unsigned long aligned_nrpages(unsigned long host_addr, - size_t size) +static unsigned long aligned_nrpages(unsigned long host_addr, size_t size) { host_addr &= ~PAGE_MASK; return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; } /* Return largest possible superpage level for a given mapping */ -static inline int hardware_largepage_caps(struct dmar_domain *domain, - unsigned long iov_pfn, - unsigned long phy_pfn, - unsigned long pages) +static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn, + unsigned long phy_pfn, unsigned long pages) { int support, level = 1; unsigned long pfnmerge; @@ -3604,7 +3424,7 @@ void intel_iommu_shutdown(void) up_write(&dmar_global_lock); } -static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) +static struct intel_iommu *dev_to_intel_iommu(struct device *dev) { struct iommu_device *iommu_dev = dev_to_iommu_device(dev); @@ -3683,7 +3503,7 @@ const struct attribute_group *intel_iommu_groups[] = { NULL, }; -static inline bool has_external_pci(void) +static bool has_external_pci(void) { struct pci_dev *pdev = NULL; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 37ddbf03d0a40..0af1399836352 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -848,6 +848,181 @@ static inline bool context_present(struct context_entry *context) return (context->lo & 1); } +#define LEVEL_STRIDE (9) +#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) +#define MAX_AGAW_WIDTH (64) +#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT) + +static inline int agaw_to_level(int agaw) +{ + return agaw + 2; +} + +static inline int agaw_to_width(int agaw) +{ + return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); +} + +static inline int width_to_agaw(int width) +{ + return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); +} + +static inline unsigned int level_to_offset_bits(int level) +{ + return (level - 1) * LEVEL_STRIDE; +} + +static inline int pfn_level_offset(u64 pfn, int level) +{ + return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; +} + +static inline u64 level_mask(int level) +{ + return -1ULL << level_to_offset_bits(level); +} + +static inline u64 level_size(int level) +{ + return 1ULL << level_to_offset_bits(level); +} + +static inline u64 align_to_level(u64 pfn, int level) +{ + return (pfn + level_size(level) - 1) & level_mask(level); +} + +static inline unsigned long lvl_to_nr_pages(unsigned int lvl) +{ + return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); +} + +/* VT-d pages must always be _smaller_ than MM pages. Otherwise things + are never going to work. */ +static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn) +{ + return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT); +} +static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn) +{ + return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1; +} +static inline unsigned long page_to_dma_pfn(struct page *pg) +{ + return mm_to_dma_pfn_start(page_to_pfn(pg)); +} +static inline unsigned long virt_to_dma_pfn(void *p) +{ + return page_to_dma_pfn(virt_to_page(p)); +} + +static inline void context_set_present(struct context_entry *context) +{ + context->lo |= 1; +} + +static inline void context_set_fault_enable(struct context_entry *context) +{ + context->lo &= (((u64)-1) << 2) | 1; +} + +static inline void context_set_translation_type(struct context_entry *context, + unsigned long value) +{ + context->lo &= (((u64)-1) << 4) | 3; + context->lo |= (value & 3) << 2; +} + +static inline void context_set_address_root(struct context_entry *context, + unsigned long value) +{ + context->lo &= ~VTD_PAGE_MASK; + context->lo |= value & VTD_PAGE_MASK; +} + +static inline void context_set_address_width(struct context_entry *context, + unsigned long value) +{ + context->hi |= value & 7; +} + +static inline void context_set_domain_id(struct context_entry *context, + unsigned long value) +{ + context->hi |= (value & ((1 << 16) - 1)) << 8; +} + +static inline void context_set_pasid(struct context_entry *context) +{ + context->lo |= CONTEXT_PASIDE; +} + +static inline int context_domain_id(struct context_entry *c) +{ + return((c->hi >> 8) & 0xffff); +} + +static inline void context_clear_entry(struct context_entry *context) +{ + context->lo = 0; + context->hi = 0; +} + +#ifdef CONFIG_INTEL_IOMMU +static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + if (!iommu->copied_tables) + return false; + + return test_bit(((long)bus << 8) | devfn, iommu->copied_tables); +} + +static inline void +set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + set_bit(((long)bus << 8) | devfn, iommu->copied_tables); +} + +static inline void +clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn) +{ + clear_bit(((long)bus << 8) | devfn, iommu->copied_tables); +} +#endif /* CONFIG_INTEL_IOMMU */ + +/* + * Set the RID_PASID field of a scalable mode context entry. The + * IOMMU hardware will use the PASID value set in this field for + * DMA translations of DMA requests without PASID. + */ +static inline void +context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) +{ + context->hi |= pasid & ((1 << 20) - 1); +} + +/* + * Set the DTE(Device-TLB Enable) field of a scalable mode context + * entry. + */ +static inline void context_set_sm_dte(struct context_entry *context) +{ + context->lo |= BIT_ULL(2); +} + +/* + * Set the PRE(Page Request Enable) field of a scalable mode context + * entry. + */ +static inline void context_set_sm_pre(struct context_entry *context) +{ + context->lo |= BIT_ULL(4); +} + +/* Convert value to context PASID directory size field coding. */ +#define context_pdts(pds) (((pds) & 0x7) << 9) + struct dmar_drhd_unit *dmar_find_matched_drhd_unit(struct pci_dev *dev); int dmar_enable_qi(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 57ae716a2c70a..3239cefa4c337 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -173,30 +173,6 @@ retry: /* * Interfaces for PASID table entry manipulation: */ -static inline void pasid_clear_entry(struct pasid_entry *pe) -{ - WRITE_ONCE(pe->val[0], 0); - WRITE_ONCE(pe->val[1], 0); - WRITE_ONCE(pe->val[2], 0); - WRITE_ONCE(pe->val[3], 0); - WRITE_ONCE(pe->val[4], 0); - WRITE_ONCE(pe->val[5], 0); - WRITE_ONCE(pe->val[6], 0); - WRITE_ONCE(pe->val[7], 0); -} - -static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) -{ - WRITE_ONCE(pe->val[0], PASID_PTE_FPD); - WRITE_ONCE(pe->val[1], 0); - WRITE_ONCE(pe->val[2], 0); - WRITE_ONCE(pe->val[3], 0); - WRITE_ONCE(pe->val[4], 0); - WRITE_ONCE(pe->val[5], 0); - WRITE_ONCE(pe->val[6], 0); - WRITE_ONCE(pe->val[7], 0); -} - static void intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore) { @@ -212,192 +188,6 @@ intel_pasid_clear_entry(struct device *dev, u32 pasid, bool fault_ignore) pasid_clear_entry(pe); } -static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) -{ - u64 old; - - old = READ_ONCE(*ptr); - WRITE_ONCE(*ptr, (old & ~mask) | bits); -} - -static inline u64 pasid_get_bits(u64 *ptr) -{ - return READ_ONCE(*ptr); -} - -/* - * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode - * PASID entry. - */ -static inline void -pasid_set_domain_id(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value); -} - -/* - * Get domain ID value of a scalable mode PASID entry. - */ -static inline u16 -pasid_get_domain_id(struct pasid_entry *pe) -{ - return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0)); -} - -/* - * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63) - * of a scalable mode PASID entry. - */ -static inline void -pasid_set_slptr(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value); -} - -/* - * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID - * entry. - */ -static inline void -pasid_set_address_width(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2); -} - -/* - * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8) - * of a scalable mode PASID entry. - */ -static inline void -pasid_set_translation_type(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6); -} - -/* - * Enable fault processing by clearing the FPD(Fault Processing - * Disable) field (Bit 1) of a scalable mode PASID entry. - */ -static inline void pasid_set_fault_enable(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[0], 1 << 1, 0); -} - -/* - * Enable second level A/D bits by setting the SLADE (Second Level - * Access Dirty Enable) field (Bit 9) of a scalable mode PASID - * entry. - */ -static inline void pasid_set_ssade(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9); -} - -/* - * Disable second level A/D bits by clearing the SLADE (Second Level - * Access Dirty Enable) field (Bit 9) of a scalable mode PASID - * entry. - */ -static inline void pasid_clear_ssade(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[0], 1 << 9, 0); -} - -/* - * Checks if second level A/D bits specifically the SLADE (Second Level - * Access Dirty Enable) field (Bit 9) of a scalable mode PASID - * entry is set. - */ -static inline bool pasid_get_ssade(struct pasid_entry *pe) -{ - return pasid_get_bits(&pe->val[0]) & (1 << 9); -} - -/* - * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a - * scalable mode PASID entry. - */ -static inline void pasid_set_sre(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 0, 1); -} - -/* - * Setup the WPE(Write Protect Enable) field (Bit 132) of a - * scalable mode PASID entry. - */ -static inline void pasid_set_wpe(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4); -} - -/* - * Setup the P(Present) field (Bit 0) of a scalable mode PASID - * entry. - */ -static inline void pasid_set_present(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[0], 1 << 0, 1); -} - -/* - * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID - * entry. - */ -static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) -{ - pasid_set_bits(&pe->val[1], 1 << 23, value << 23); -} - -/* - * Setup No Execute Enable bit (Bit 133) of a scalable mode PASID - * entry. It is required when XD bit of the first level page table - * entry is about to be set. - */ -static inline void pasid_set_nxe(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 5, 1 << 5); -} - -/* - * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode - * PASID entry. - */ -static inline void -pasid_set_pgsnp(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24); -} - -/* - * Setup the First Level Page table Pointer field (Bit 140~191) - * of a scalable mode PASID entry. - */ -static inline void -pasid_set_flptr(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value); -} - -/* - * Setup the First Level Paging Mode field (Bit 130~131) of a - * scalable mode PASID entry. - */ -static inline void -pasid_set_flpm(struct pasid_entry *pe, u64 value) -{ - pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); -} - -/* - * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) - * of a scalable mode PASID entry. - */ -static inline void pasid_set_eafe(struct pasid_entry *pe) -{ - pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); -} - static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) @@ -556,9 +346,9 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, * Skip top levels of page tables for iommu which has less agaw * than default. Unnecessary for PT mode. */ -static inline int iommu_skip_agaw(struct dmar_domain *domain, - struct intel_iommu *iommu, - struct dma_pte **pgd) +static int iommu_skip_agaw(struct dmar_domain *domain, + struct intel_iommu *iommu, + struct dma_pte **pgd) { int agaw; diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 00401cfc2a406..8d40d4c66e319 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -86,6 +86,216 @@ static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) return (u16)((READ_ONCE(pte->val[0]) >> 6) & 0x7); } +static inline void pasid_clear_entry(struct pasid_entry *pe) +{ + WRITE_ONCE(pe->val[0], 0); + WRITE_ONCE(pe->val[1], 0); + WRITE_ONCE(pe->val[2], 0); + WRITE_ONCE(pe->val[3], 0); + WRITE_ONCE(pe->val[4], 0); + WRITE_ONCE(pe->val[5], 0); + WRITE_ONCE(pe->val[6], 0); + WRITE_ONCE(pe->val[7], 0); +} + +static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) +{ + WRITE_ONCE(pe->val[0], PASID_PTE_FPD); + WRITE_ONCE(pe->val[1], 0); + WRITE_ONCE(pe->val[2], 0); + WRITE_ONCE(pe->val[3], 0); + WRITE_ONCE(pe->val[4], 0); + WRITE_ONCE(pe->val[5], 0); + WRITE_ONCE(pe->val[6], 0); + WRITE_ONCE(pe->val[7], 0); +} + +static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) +{ + u64 old; + + old = READ_ONCE(*ptr); + WRITE_ONCE(*ptr, (old & ~mask) | bits); +} + +static inline u64 pasid_get_bits(u64 *ptr) +{ + return READ_ONCE(*ptr); +} + +/* + * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode + * PASID entry. + */ +static inline void +pasid_set_domain_id(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value); +} + +/* + * Get domain ID value of a scalable mode PASID entry. + */ +static inline u16 +pasid_get_domain_id(struct pasid_entry *pe) +{ + return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0)); +} + +/* + * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_slptr(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value); +} + +/* + * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID + * entry. + */ +static inline void +pasid_set_address_width(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2); +} + +/* + * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_translation_type(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6); +} + +/* + * Enable fault processing by clearing the FPD(Fault Processing + * Disable) field (Bit 1) of a scalable mode PASID entry. + */ +static inline void pasid_set_fault_enable(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 1, 0); +} + +/* + * Enable second level A/D bits by setting the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9); +} + +/* + * Disable second level A/D bits by clearing the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_clear_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 0); +} + +/* + * Checks if second level A/D bits specifically the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry is set. + */ +static inline bool pasid_get_ssade(struct pasid_entry *pe) +{ + return pasid_get_bits(&pe->val[0]) & (1 << 9); +} + +/* + * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_sre(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 0, 1); +} + +/* + * Setup the WPE(Write Protect Enable) field (Bit 132) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_wpe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4); +} + +/* + * Setup the P(Present) field (Bit 0) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_present(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 0, 1); +} + +/* + * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value) +{ + pasid_set_bits(&pe->val[1], 1 << 23, value << 23); +} + +/* + * Setup No Execute Enable bit (Bit 133) of a scalable mode PASID + * entry. It is required when XD bit of the first level page table + * entry is about to be set. + */ +static inline void pasid_set_nxe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 5, 1 << 5); +} + +/* + * Setup the Page Snoop (PGSNP) field (Bit 88) of a scalable mode + * PASID entry. + */ +static inline void +pasid_set_pgsnp(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[1], 1ULL << 24, 1ULL << 24); +} + +/* + * Setup the First Level Page table Pointer field (Bit 140~191) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_flptr(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value); +} + +/* + * Setup the First Level Paging Mode field (Bit 130~131) of a + * scalable mode PASID entry. + */ +static inline void +pasid_set_flpm(struct pasid_entry *pe, u64 value) +{ + pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); +} + +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. + */ +static inline void pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + extern unsigned int intel_pasid_max_id; int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); -- GitLab From bb57f6705960bebeb832142ce9abf43220c3eab1 Mon Sep 17 00:00:00 2001 From: Ashish Mhetre Date: Tue, 5 Dec 2023 12:26:56 +0530 Subject: [PATCH 0396/1290] iommu: Don't reserve 0-length IOVA region When the bootloader/firmware doesn't setup the framebuffers, their address and size are 0 in "iommu-addresses" property. If IOVA region is reserved with 0 length, then it ends up corrupting the IOVA rbtree with an entry which has pfn_hi < pfn_lo. If we intend to use display driver in kernel without framebuffer then it's causing the display IOMMU mappings to fail as entire valid IOVA space is reserved when address and length are passed as 0. An ideal solution would be firmware removing the "iommu-addresses" property and corresponding "memory-region" if display is not present. But the kernel should be able to handle this by checking for size of IOVA region and skipping the IOVA reservation if size is 0. Also, add a warning if firmware is requesting 0-length IOVA region reservation. Fixes: a5bf3cfce8cb ("iommu: Implement of_iommu_get_resv_regions()") Signed-off-by: Ashish Mhetre Acked-by: Robin Murphy Link: https://lore.kernel.org/r/20231205065656.9544-1-amhetre@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/of_iommu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 164317bfb8a81..691335a903a89 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -252,6 +252,10 @@ void of_iommu_get_resv_regions(struct device *dev, struct list_head *list) prot |= IOMMU_CACHE; maps = of_translate_dma_region(np, maps, &iova, &length); + if (length == 0) { + dev_warn(dev, "Cannot reserve IOVA region of 0 size\n"); + continue; + } type = iommu_resv_region_get_type(dev, &phys, iova, length); region = iommu_alloc_resv_region(iova, length, prot, type, -- GitLab From 624dda101e03c3a3a155d51e37a7bb7607cb760b Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 12 Dec 2023 17:59:08 +0100 Subject: [PATCH 0397/1290] perf archive: Add new option '--all' to pack perf.data with DSOs 'perf archive' has limited functionality and people from Red Hat Global Support Services sent a request for a new feature that would pack perf.data file together with an archive with debug symbols created by the command 'perf archive' as customers were being confused and often would forget to send perf.data file with the debug symbols. With this patch 'perf archive' now accepts an option '--all' that generates archive 'perf.all-hostname-date-time.tar.bz2' that holds file 'perf.data' and a sub-tar 'perf.symbols.tar.bz2' with debug symbols. The functionality of the command 'perf archive' was not changed. Committer testing: Run 'perf record' on a Intel 14900K machine, hybrid: root@number:~# perf record -a sleep 5s [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 4.006 MB perf.data (15427 samples) ] root@number:~# perf archive --all Now please run: $ tar xvf perf.all-number-20231219-104854.tar.bz2 && tar xvf perf.symbols.tar.bz2 -C ~/.debug wherever you need to run 'perf report' on. root@number:~# root@number:~# perf report --header-only # ======== # captured on : Tue Dec 19 10:48:48 2023 # header version : 1 # data offset : 1008 # data size : 4199936 # feat offset : 4200944 # hostname : number # os release : 6.6.4-200.fc39.x86_64 # perf version : 6.7.rc6.gca90f8e17b84 # arch : x86_64 # nrcpus online : 28 # nrcpus avail : 28 # cpudesc : Intel(R) Core(TM) i7-14700K # cpuid : GenuineIntel,6,183,1 # total memory : 32610508 kB # cmdline : /home/acme/bin/perf (deleted) record -a sleep 5s # event : name = cpu_atom/cycles/P, , id = { 5088024, 5088025, 5088026, 5088027, 5088028, 5088029, 5088030, 5088031, 5088032, 5088033, 5088034, 5088035 }, type = 0 (PERF_TYPE_HARDWARE), size> # event : name = cpu_core/cycles/P, , id = { 5088036, 5088037, 5088038, 5088039, 5088040, 5088041, 5088042, 5088043, 5088044, 5088045, 5088046, 5088047, 5088048, 5088049, 5088050, 5088051 },> # event : name = dummy:u, , id = { 5088052, 5088053, 5088054, 5088055, 5088056, 5088057, 5088058, 5088059, 5088060, 5088061, 5088062, 5088063, 5088064, 5088065, 5088066, 5088067, 5088068, 50> # CPU_TOPOLOGY info available, use -I to display # NUMA_TOPOLOGY info available, use -I to display # pmu mappings: cpu_atom = 10, cpu_core = 4, breakpoint = 5, cstate_core = 34, cstate_pkg = 35, i915 = 14, intel_bts = 11, intel_pt = 12, kprobe = 8, msr = 13, power = 36, software = 1, trac> # CACHE info available, use -I to display # time of first sample : 124739.850375 # time of last sample : 124744.855181 # sample duration : 5004.806 ms # sample duration : 5004.806 ms # MEM_TOPOLOGY info available, use -I to display # bpf_prog_info 2: bpf_prog_7cc47bbf07148bfe_hid_tail_call addr 0xffffffffc0000978 size 113 # bpf_prog_info 47: bpf_prog_713a545fe0530ce7_restrict_filesystems addr 0xffffffffc0000748 size 305 # bpf_prog_info 163: bpf_prog_bd834b0730296056 addr 0xffffffffc000df14 size 331 # bpf_prog_info 258: bpf_prog_ee0e253c78993a24_sd_devices addr 0xffffffffc001fc08 size 264 # bpf_prog_info 259: bpf_prog_40ddf486530245f5_sd_devices addr 0xffffffffc00204bc size 318 # bpf_prog_info 260: bpf_prog_6deef7357e7b4530_sd_fw_egress addr 0xffffffffc0020630 size 63 # bpf_prog_info 261: bpf_prog_6deef7357e7b4530_sd_fw_ingress addr 0xffffffffc0020688 size 63 # bpf_prog_info 262: bpf_prog_b37200ab714f0e17_sd_devices addr 0xffffffffc002072c size 110 # bpf_prog_info 263: bpf_prog_b90a282ee45cfed9_sd_devices addr 0xffffffffc00207d8 size 393 # bpf_prog_info 264: bpf_prog_ee0e253c78993a24_sd_devices addr 0xffffffffc002099c size 264 # bpf_prog_info 265: bpf_prog_6deef7357e7b4530_sd_fw_egress addr 0xffffffffc0020ad4 size 63 # bpf_prog_info 266: bpf_prog_6deef7357e7b4530_sd_fw_ingress addr 0xffffffffc0020b50 size 63 # bpf_prog_info 267: bpf_prog_ee0e253c78993a24_sd_devices addr 0xffffffffc002d98c size 264 # bpf_prog_info 268: bpf_prog_be31ae23198a0378_sd_devices addr 0xffffffffc002dac8 size 297 # bpf_prog_info 269: bpf_prog_ccbbf91f3c6979c7_sd_devices addr 0xffffffffc002dc54 size 360 # bpf_prog_info 270: bpf_prog_3a0ef5414c2f6fca_sd_devices addr 0xffffffffc002dde8 size 456 # bpf_prog_info 271: bpf_prog_6deef7357e7b4530_sd_fw_egress addr 0xffffffffc0020bd4 size 63 # bpf_prog_info 272: bpf_prog_6deef7357e7b4530_sd_fw_ingress addr 0xffffffffc00299b4 size 63 # bpf_prog_info 273: bpf_prog_ee0e253c78993a24_sd_devices addr 0xffffffffc002dfd0 size 264 # bpf_prog_info 274: bpf_prog_6deef7357e7b4530_sd_fw_egress addr 0xffffffffc0029a3c size 63 # bpf_prog_info 275: bpf_prog_6deef7357e7b4530_sd_fw_ingress addr 0xffffffffc002d71c size 63 # bpf_prog_info 276: bpf_prog_6deef7357e7b4530_sd_fw_egress addr 0xffffffffc002d7a8 size 63 # bpf_prog_info 277: bpf_prog_6deef7357e7b4530_sd_fw_ingress addr 0xffffffffc002e13c size 63 # bpf_prog_info 278: bpf_prog_6deef7357e7b4530_sd_fw_egress addr 0xffffffffc002e1a8 size 63 # bpf_prog_info 279: bpf_prog_6deef7357e7b4530_sd_fw_ingress addr 0xffffffffc002e234 size 63 # bpf_prog_info 280: bpf_prog_be31ae23198a0378_sd_devices addr 0xffffffffc002e2ac size 297 # bpf_prog_info 281: bpf_prog_6deef7357e7b4530_sd_fw_egress addr 0xffffffffc002e42c size 63 # bpf_prog_info 282: bpf_prog_6deef7357e7b4530_sd_fw_ingress addr 0xffffffffc002e49c size 63 # bpf_prog_info 290: bpf_prog_ee0e253c78993a24_sd_devices addr 0xffffffffc0004b18 size 264 # bpf_prog_info 294: bpf_prog_0b1566e4b83190c5_sd_devices addr 0xffffffffc0004c50 size 360 # bpf_prog_info 295: bpf_prog_ee0e253c78993a24_sd_devices addr 0xffffffffc001cfc8 size 264 # bpf_prog_info 296: bpf_prog_6deef7357e7b4530_sd_fw_egress addr 0xffffffffc0013abc size 63 # bpf_prog_info 297: bpf_prog_6deef7357e7b4530_sd_fw_ingress addr 0xffffffffc0013b24 size 63 # btf info of id 2 # btf info of id 52 # HYBRID_TOPOLOGY info available, use -I to display # cpu_atom pmu capabilities: branches=32, max_precise=3, pmu_name=alderlake_hybrid # cpu_core pmu capabilities: branches=32, max_precise=3, pmu_name=alderlake_hybrid # intel_pt pmu capabilities: topa_multiple_entries=1, psb_cyc=1, single_range_output=1, mtc_periods=249, ip_filtering=1, output_subsys=0, cr3_filtering=1, psb_periods=3f, event_trace=0, cycl> # missing features: TRACING_DATA BRANCH_STACK GROUP_DESC AUXTRACE STAT CLOCKID DIR_FORMAT COMPRESSED CPU_PMU_CAPS CLOCK_DATA # ======== # root@number:~# And then transferring it to a ARM64 machine, a Libre Computer RK3399-PC: root@number:~# scp perf.all-number-20231219-104854.tar.bz2 acme@192.168.86.114:. acme@192.168.86.114's password: perf.all-number-20231219-104854.tar.bz2 100% 145MB 85.4MB/s 00:01 root@number:~# root@number:~# ssh acme@192.168.86.114 acme@192.168.86.114's password: Welcome to Ubuntu 23.04 (GNU/Linux 6.1.68-12200-g1c40dda3081e aarch64) * Documentation: https://help.ubuntu.com * Management: https://landscape.canonical.com * Support: https://ubuntu.com/advantage Last login: Tue Dec 19 14:53:18 2023 from 192.168.86.42 acme@roc-rk3399-pc:~$ tar xvf perf.all-number-20231219-104854.tar.bz2 && tar xvf perf.symbols.tar.bz2 -C ~/.debug perf.data perf.symbols.tar.bz2 .build-id/ad/acc227f470409213308050b71f664322e2956c [kernel.kallsyms]/adacc227f470409213308050b71f664322e2956c/ [kernel.kallsyms]/adacc227f470409213308050b71f664322e2956c/kallsyms [kernel.kallsyms]/adacc227f470409213308050b71f664322e2956c/probes .build-id/76/c91f4d62baa06bb52e07e20aba36d21a8f9797 usr/lib64/libz.so.1.2.13/76c91f4d62baa06bb52e07e20aba36d21a8f9797/ .build-id/09/d7e96bc1e3f599d15ca28b36959124b2d74410 usr/lib64/librpm_sequoia.so.1/09d7e96bc1e3f599d15ca28b36959124b2d74410/ usr/lib64/librpm_sequoia.so.1/09d7e96bc1e3f599d15ca28b36959124b2d74410/elf usr/lib64/librpm_sequoia.so.1/09d7e96bc1e3f599d15ca28b36959124b2d74410/probes acme@roc-rk3399-pc:~$ acme@roc-rk3399-pc:~$ perf report --stdio | head -40 # To display the perf.data header info, please use --header/--header-only options. # # Total Lost Samples: 0 # # Samples: 6K of event 'cpu_atom/cycles/P' # Event count (approx.): 4519946621 # # Overhead Command Shared Object Symbol # ........ ............... .............................................. ......................................................................................................................................................... # 1.73% swapper [kernel.kallsyms] [k] intel_idle 1.43% sh [kernel.kallsyms] [k] next_uptodate_folio 0.94% make ld-linux-x86-64.so.2 [.] do_lookup_x 0.90% sh ld-linux-x86-64.so.2 [.] do_lookup_x 0.82% sh [kernel.kallsyms] [k] perf_event_mmap_output 0.74% sh [kernel.kallsyms] [k] filemap_map_pages 0.72% sh ld-linux-x86-64.so.2 [.] _dl_relocate_object 0.69% cc1 [kernel.kallsyms] [k] clear_page_erms 0.61% sh [kernel.kallsyms] [k] unmap_page_range 0.56% swapper [kernel.kallsyms] [k] poll_idle 0.52% cc1 ld-linux-x86-64.so.2 [.] do_lookup_x 0.47% make ld-linux-x86-64.so.2 [.] _dl_relocate_object 0.44% cc1 cc1 [.] make_node(tree_code) 0.43% sh [kernel.kallsyms] [k] native_irq_return_iret 0.38% sh libc.so.6 [.] _int_malloc 0.38% cc1 cc1 [.] decl_attributes(tree_node**, tree_node*, int, tree_node*) 0.38% sh [kernel.kallsyms] [k] clear_page_erms 0.37% cc1 cc1 [.] ht_lookup_with_hash(ht*, unsigned char const*, unsigned long, unsigned int, ht_lookup_option) 0.37% make [kernel.kallsyms] [k] perf_event_mmap_output 0.37% make ld-linux-x86-64.so.2 [.] _dl_lookup_symbol_x 0.35% sh [kernel.kallsyms] [k] _compound_head 0.35% make make [.] hash_find_slot 0.33% sh libc.so.6 [.] __strlen_avx2 0.33% cc1 cc1 [.] ggc_internal_alloc(unsigned long, void (*)(void*), unsigned long, unsigned long) 0.33% sh [kernel.kallsyms] [k] perf_iterate_ctx 0.31% make make [.] jhash_string 0.31% sh [kernel.kallsyms] [k] page_remove_rmap 0.30% cc1 libc.so.6 [.] _int_malloc 0.30% make libc.so.6 [.] _int_malloc acme@roc-rk3399-pc:~$ Signed-off-by: Veronika Molnarova Tested-by: Arnaldo Carvalho de Melo Cc: Michael Petlan Link: https://lore.kernel.org/r/20231212165909.14459-1-vmolnaro@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/perf-archive.sh | 40 ++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) mode change 100644 => 100755 tools/perf/perf-archive.sh diff --git a/tools/perf/perf-archive.sh b/tools/perf/perf-archive.sh old mode 100644 new mode 100755 index 133f0eddbcc46..a92042eae95a1 --- a/tools/perf/perf-archive.sh +++ b/tools/perf/perf-archive.sh @@ -4,9 +4,19 @@ # Arnaldo Carvalho de Melo PERF_DATA=perf.data -if [ $# -ne 0 ] ; then - PERF_DATA=$1 -fi +PERF_SYMBOLS=perf.symbols +PERF_ALL=perf.all +ALL=0 + +while [ $# -gt 0 ] ; do + if [ $1 == "--all" ]; then + ALL=1 + shift + else + PERF_DATA=$1 + shift + fi +done # # PERF_BUILDID_DIR environment variable set by perf @@ -39,9 +49,23 @@ while read build_id ; do echo ${filename#$PERF_BUILDID_LINKDIR} >> $MANIFEST done -tar cjf $PERF_DATA.tar.bz2 -C $PERF_BUILDID_DIR -T $MANIFEST -rm $MANIFEST $BUILDIDS || true -echo -e "Now please run:\n" -echo -e "$ tar xvf $PERF_DATA.tar.bz2 -C ~/.debug\n" -echo "wherever you need to run 'perf report' on." +if [ $ALL -eq 1 ]; then # pack perf.data file together with tar containing debug symbols + HOSTNAME=$(hostname) + DATE=$(date '+%Y%m%d-%H%M%S') + tar cjf $PERF_SYMBOLS.tar.bz2 -C $PERF_BUILDID_DIR -T $MANIFEST + tar cjf $PERF_ALL-$HOSTNAME-$DATE.tar.bz2 $PERF_DATA $PERF_SYMBOLS.tar.bz2 + rm $PERF_SYMBOLS.tar.bz2 $MANIFEST $BUILDIDS || true + + echo -e "Now please run:\n" + echo -e "$ tar xvf $PERF_ALL-$HOSTNAME-$DATE.tar.bz2 && tar xvf $PERF_SYMBOLS.tar.bz2 -C ~/.debug\n" + echo "wherever you need to run 'perf report' on." +else # pack only the debug symbols + tar cjf $PERF_DATA.tar.bz2 -C $PERF_BUILDID_DIR -T $MANIFEST + rm $MANIFEST $BUILDIDS || true + + echo -e "Now please run:\n" + echo -e "$ tar xvf $PERF_DATA.tar.bz2 -C ~/.debug\n" + echo "wherever you need to run 'perf report' on." +fi + exit 0 -- GitLab From 838bebb4c926a573839ff1bfe1b654a6ed0f9779 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Tue, 19 Dec 2023 11:32:39 +0200 Subject: [PATCH 0398/1290] virtio: Define feature bit for administration virtqueue Introduce VIRTIO_F_ADMIN_VQ which is used for administration virtqueue support. Signed-off-by: Feng Liu Reviewed-by: Parav Pandit Reviewed-by: Jiri Pirko Acked-by: Michael S. Tsirkin Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20231219093247.170936-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/uapi/linux/virtio_config.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index 8881aea60f6f1..2445f365bce74 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -52,7 +52,7 @@ * rest are per-device feature bits. */ #define VIRTIO_TRANSPORT_F_START 28 -#define VIRTIO_TRANSPORT_F_END 41 +#define VIRTIO_TRANSPORT_F_END 42 #ifndef VIRTIO_CONFIG_NO_LEGACY /* Do we get callbacks when the ring is completely used, even if we've @@ -114,4 +114,10 @@ * This feature indicates that the driver can reset a queue individually. */ #define VIRTIO_F_RING_RESET 40 + +/* + * This feature indicates that the device support administration virtqueues. + */ +#define VIRTIO_F_ADMIN_VQ 41 + #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */ -- GitLab From fd27ef6b44bec26915c5b2b22c13856d9f0ba17a Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Tue, 19 Dec 2023 11:32:40 +0200 Subject: [PATCH 0399/1290] virtio-pci: Introduce admin virtqueue Introduce support for the admin virtqueue. By negotiating VIRTIO_F_ADMIN_VQ feature, driver detects capability and creates one administration virtqueue. Administration virtqueue implementation in virtio pci generic layer, enables multiple types of upper layer drivers such as vfio, net, blk to utilize it. Signed-off-by: Feng Liu Reviewed-by: Parav Pandit Reviewed-by: Jiri Pirko Acked-by: Michael S. Tsirkin Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20231219093247.170936-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/virtio/virtio.c | 37 +++++++++++-- drivers/virtio/virtio_pci_common.c | 3 ++ drivers/virtio/virtio_pci_common.h | 15 +++++- drivers/virtio/virtio_pci_modern.c | 75 +++++++++++++++++++++++++- drivers/virtio/virtio_pci_modern_dev.c | 24 ++++++++- include/linux/virtio_config.h | 4 ++ include/linux/virtio_pci_modern.h | 2 + include/uapi/linux/virtio_pci.h | 5 ++ 8 files changed, 157 insertions(+), 8 deletions(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 3893dc29eb263..f4080692b3513 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -302,9 +302,15 @@ static int virtio_dev_probe(struct device *_d) if (err) goto err; + if (dev->config->create_avq) { + err = dev->config->create_avq(dev); + if (err) + goto err; + } + err = drv->probe(dev); if (err) - goto err; + goto err_probe; /* If probe didn't do it, mark device DRIVER_OK ourselves. */ if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) @@ -316,6 +322,10 @@ static int virtio_dev_probe(struct device *_d) virtio_config_enable(dev); return 0; + +err_probe: + if (dev->config->destroy_avq) + dev->config->destroy_avq(dev); err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return err; @@ -331,6 +341,9 @@ static void virtio_dev_remove(struct device *_d) drv->remove(dev); + if (dev->config->destroy_avq) + dev->config->destroy_avq(dev); + /* Driver should have reset device. */ WARN_ON_ONCE(dev->config->get_status(dev)); @@ -489,13 +502,20 @@ EXPORT_SYMBOL_GPL(unregister_virtio_device); int virtio_device_freeze(struct virtio_device *dev) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); + int ret; virtio_config_disable(dev); dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED; - if (drv && drv->freeze) - return drv->freeze(dev); + if (drv && drv->freeze) { + ret = drv->freeze(dev); + if (ret) + return ret; + } + + if (dev->config->destroy_avq) + dev->config->destroy_avq(dev); return 0; } @@ -532,10 +552,16 @@ int virtio_device_restore(struct virtio_device *dev) if (ret) goto err; + if (dev->config->create_avq) { + ret = dev->config->create_avq(dev); + if (ret) + goto err; + } + if (drv->restore) { ret = drv->restore(dev); if (ret) - goto err; + goto err_restore; } /* If restore didn't do it, mark device DRIVER_OK ourselves. */ @@ -546,6 +572,9 @@ int virtio_device_restore(struct virtio_device *dev) return 0; +err_restore: + if (dev->config->destroy_avq) + dev->config->destroy_avq(dev); err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return ret; diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 7a5593997e0ef..fafd13d0e4d46 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -236,6 +236,9 @@ void vp_del_vqs(struct virtio_device *vdev) int i; list_for_each_entry_safe(vq, n, &vdev->vqs, list) { + if (vp_dev->is_avq(vdev, vq->index)) + continue; + if (vp_dev->per_vq_vectors) { int v = vp_dev->vqs[vq->index]->msix_vector; diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 4b773bd7c58cb..7306128e63e98 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -41,6 +41,14 @@ struct virtio_pci_vq_info { unsigned int msix_vector; }; +struct virtio_pci_admin_vq { + /* Virtqueue info associated with this admin queue. */ + struct virtio_pci_vq_info info; + /* Name of the admin queue: avq.$vq_index. */ + char name[10]; + u16 vq_index; +}; + /* Our device structure */ struct virtio_pci_device { struct virtio_device vdev; @@ -58,9 +66,13 @@ struct virtio_pci_device { spinlock_t lock; struct list_head virtqueues; - /* array of all queues for house-keeping */ + /* Array of all virtqueues reported in the + * PCI common config num_queues field + */ struct virtio_pci_vq_info **vqs; + struct virtio_pci_admin_vq admin_vq; + /* MSI-X support */ int msix_enabled; int intx_enabled; @@ -86,6 +98,7 @@ struct virtio_pci_device { void (*del_vq)(struct virtio_pci_vq_info *info); u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector); + bool (*is_avq)(struct virtio_device *vdev, unsigned int index); }; /* Constants for MSI-X */ diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index ee6a386d250b1..ce915018b5b0f 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -19,6 +19,8 @@ #define VIRTIO_RING_NO_LEGACY #include "virtio_pci_common.h" +#define VIRTIO_AVQ_SGS_MAX 4 + static u64 vp_get_features(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -26,6 +28,16 @@ static u64 vp_get_features(struct virtio_device *vdev) return vp_modern_get_features(&vp_dev->mdev); } +static bool vp_is_avq(struct virtio_device *vdev, unsigned int index) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) + return false; + + return index == vp_dev->admin_vq.vq_index; +} + static void vp_transport_features(struct virtio_device *vdev, u64 features) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -37,6 +49,9 @@ static void vp_transport_features(struct virtio_device *vdev, u64 features) if (features & BIT_ULL(VIRTIO_F_RING_RESET)) __virtio_set_bit(vdev, VIRTIO_F_RING_RESET); + + if (features & BIT_ULL(VIRTIO_F_ADMIN_VQ)) + __virtio_set_bit(vdev, VIRTIO_F_ADMIN_VQ); } static int __vp_check_common_size_one_feature(struct virtio_device *vdev, u32 fbit, @@ -69,6 +84,9 @@ static int vp_check_common_size(struct virtio_device *vdev) if (vp_check_common_size_one_feature(vdev, VIRTIO_F_RING_RESET, queue_reset)) return -EINVAL; + if (vp_check_common_size_one_feature(vdev, VIRTIO_F_ADMIN_VQ, admin_queue_num)) + return -EINVAL; + return 0; } @@ -345,6 +363,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, struct virtio_pci_modern_device *mdev = &vp_dev->mdev; bool (*notify)(struct virtqueue *vq); struct virtqueue *vq; + bool is_avq; u16 num; int err; @@ -353,11 +372,13 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, else notify = vp_notify; - if (index >= vp_modern_get_num_queues(mdev)) + is_avq = vp_is_avq(&vp_dev->vdev, index); + if (index >= vp_modern_get_num_queues(mdev) && !is_avq) return ERR_PTR(-EINVAL); + num = is_avq ? + VIRTIO_AVQ_SGS_MAX : vp_modern_get_queue_size(mdev, index); /* Check if queue is either not available or already active. */ - num = vp_modern_get_queue_size(mdev, index); if (!num || vp_modern_get_queue_enable(mdev, index)) return ERR_PTR(-ENOENT); @@ -383,6 +404,9 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, goto err; } + if (is_avq) + vp_dev->admin_vq.info.vq = vq; + return vq; err: @@ -418,6 +442,9 @@ static void del_vq(struct virtio_pci_vq_info *info) struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + if (vp_is_avq(&vp_dev->vdev, vq->index)) + vp_dev->admin_vq.info.vq = NULL; + if (vp_dev->msix_enabled) vp_modern_queue_vector(mdev, vq->index, VIRTIO_MSI_NO_VECTOR); @@ -527,6 +554,45 @@ static bool vp_get_shm_region(struct virtio_device *vdev, return true; } +static int vp_modern_create_avq(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_admin_vq *avq; + struct virtqueue *vq; + u16 admin_q_num; + + if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) + return 0; + + admin_q_num = vp_modern_avq_num(&vp_dev->mdev); + if (!admin_q_num) + return -EINVAL; + + avq = &vp_dev->admin_vq; + avq->vq_index = vp_modern_avq_index(&vp_dev->mdev); + sprintf(avq->name, "avq.%u", avq->vq_index); + vq = vp_dev->setup_vq(vp_dev, &vp_dev->admin_vq.info, avq->vq_index, NULL, + avq->name, NULL, VIRTIO_MSI_NO_VECTOR); + if (IS_ERR(vq)) { + dev_err(&vdev->dev, "failed to setup admin virtqueue, err=%ld", + PTR_ERR(vq)); + return PTR_ERR(vq); + } + + vp_modern_set_queue_enable(&vp_dev->mdev, avq->info.vq->index, true); + return 0; +} + +static void vp_modern_destroy_avq(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) + return; + + vp_dev->del_vq(&vp_dev->admin_vq.info); +} + static const struct virtio_config_ops virtio_pci_config_nodev_ops = { .get = NULL, .set = NULL, @@ -545,6 +611,8 @@ static const struct virtio_config_ops virtio_pci_config_nodev_ops = { .get_shm_region = vp_get_shm_region, .disable_vq_and_reset = vp_modern_disable_vq_and_reset, .enable_vq_after_reset = vp_modern_enable_vq_after_reset, + .create_avq = vp_modern_create_avq, + .destroy_avq = vp_modern_destroy_avq, }; static const struct virtio_config_ops virtio_pci_config_ops = { @@ -565,6 +633,8 @@ static const struct virtio_config_ops virtio_pci_config_ops = { .get_shm_region = vp_get_shm_region, .disable_vq_and_reset = vp_modern_disable_vq_and_reset, .enable_vq_after_reset = vp_modern_enable_vq_after_reset, + .create_avq = vp_modern_create_avq, + .destroy_avq = vp_modern_destroy_avq, }; /* the PCI probing function */ @@ -588,6 +658,7 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) vp_dev->config_vector = vp_config_vector; vp_dev->setup_vq = setup_vq; vp_dev->del_vq = del_vq; + vp_dev->is_avq = vp_is_avq; vp_dev->isr = mdev->isr; vp_dev->vdev.id = mdev->id; diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index 7de8b1ebabac4..0d3dbfaf4b236 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -207,6 +207,10 @@ static inline void check_offsets(void) offsetof(struct virtio_pci_modern_common_cfg, queue_notify_data)); BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_RESET != offsetof(struct virtio_pci_modern_common_cfg, queue_reset)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_ADM_Q_IDX != + offsetof(struct virtio_pci_modern_common_cfg, admin_queue_index)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_ADM_Q_NUM != + offsetof(struct virtio_pci_modern_common_cfg, admin_queue_num)); } /* @@ -296,7 +300,7 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev) mdev->common = vp_modern_map_capability(mdev, common, sizeof(struct virtio_pci_common_cfg), 4, 0, offsetofend(struct virtio_pci_modern_common_cfg, - queue_reset), + admin_queue_num), &mdev->common_len, NULL); if (!mdev->common) goto err_map_common; @@ -719,6 +723,24 @@ void __iomem *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, } EXPORT_SYMBOL_GPL(vp_modern_map_vq_notify); +u16 vp_modern_avq_num(struct virtio_pci_modern_device *mdev) +{ + struct virtio_pci_modern_common_cfg __iomem *cfg; + + cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common; + return vp_ioread16(&cfg->admin_queue_num); +} +EXPORT_SYMBOL_GPL(vp_modern_avq_num); + +u16 vp_modern_avq_index(struct virtio_pci_modern_device *mdev) +{ + struct virtio_pci_modern_common_cfg __iomem *cfg; + + cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common; + return vp_ioread16(&cfg->admin_queue_index); +} +EXPORT_SYMBOL_GPL(vp_modern_avq_index); + MODULE_VERSION("0.1"); MODULE_DESCRIPTION("Modern Virtio PCI Device"); MODULE_AUTHOR("Jason Wang "); diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 2b3438de2c4d4..da9b271b54db8 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -93,6 +93,8 @@ typedef void vq_callback_t(struct virtqueue *); * Returns 0 on success or error status * If disable_vq_and_reset is set, then enable_vq_after_reset must also be * set. + * @create_avq: create admin virtqueue resource. + * @destroy_avq: destroy admin virtqueue resource. */ struct virtio_config_ops { void (*get)(struct virtio_device *vdev, unsigned offset, @@ -120,6 +122,8 @@ struct virtio_config_ops { struct virtio_shm_region *region, u8 id); int (*disable_vq_and_reset)(struct virtqueue *vq); int (*enable_vq_after_reset)(struct virtqueue *vq); + int (*create_avq)(struct virtio_device *vdev); + void (*destroy_avq)(struct virtio_device *vdev); }; /* If driver didn't advertise the feature, it will never appear. */ diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index a09e13a577a99..c0b1b1ca11635 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -125,4 +125,6 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev); void vp_modern_remove(struct virtio_pci_modern_device *mdev); int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index); void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index); +u16 vp_modern_avq_num(struct virtio_pci_modern_device *mdev); +u16 vp_modern_avq_index(struct virtio_pci_modern_device *mdev); #endif diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 44f4dd2add188..240ddeef7eae3 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -175,6 +175,9 @@ struct virtio_pci_modern_common_cfg { __le16 queue_notify_data; /* read-write */ __le16 queue_reset; /* read-write */ + + __le16 admin_queue_index; /* read-only */ + __le16 admin_queue_num; /* read-only */ }; /* Fields in VIRTIO_PCI_CAP_PCI_CFG: */ @@ -215,6 +218,8 @@ struct virtio_pci_cfg_cap { #define VIRTIO_PCI_COMMON_Q_USEDHI 52 #define VIRTIO_PCI_COMMON_Q_NDATA 56 #define VIRTIO_PCI_COMMON_Q_RESET 58 +#define VIRTIO_PCI_COMMON_ADM_Q_IDX 60 +#define VIRTIO_PCI_COMMON_ADM_Q_NUM 62 #endif /* VIRTIO_PCI_NO_MODERN */ -- GitLab From 92792ac752aa80d5ee71bc291d90edd06cd76bd1 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Tue, 19 Dec 2023 11:32:41 +0200 Subject: [PATCH 0400/1290] virtio-pci: Introduce admin command sending function Add support for sending admin command through admin virtqueue interface. Abort any inflight admin commands once device reset completes. Activate admin queue when device becomes ready; deactivate on device reset. To comply to the below specification statement [1], the admin virtqueue is activated for upper layer users only after setting DRIVER_OK status. [1] The driver MUST NOT send any buffer available notifications to the device before setting DRIVER_OK. Signed-off-by: Feng Liu Reviewed-by: Parav Pandit Acked-by: Michael S. Tsirkin Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20231219093247.170936-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/virtio/virtio_pci_common.h | 6 ++ drivers/virtio/virtio_pci_modern.c | 143 ++++++++++++++++++++++++++++- include/linux/virtio.h | 8 ++ include/uapi/linux/virtio_pci.h | 22 +++++ 4 files changed, 177 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 7306128e63e98..282d087a92667 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -29,6 +29,7 @@ #include #include #include +#include struct virtio_pci_vq_info { /* the actual virtqueue */ @@ -44,6 +45,8 @@ struct virtio_pci_vq_info { struct virtio_pci_admin_vq { /* Virtqueue info associated with this admin queue. */ struct virtio_pci_vq_info info; + /* serializing admin commands execution and virtqueue deletion */ + struct mutex cmd_lock; /* Name of the admin queue: avq.$vq_index. */ char name[10]; u16 vq_index; @@ -152,4 +155,7 @@ static inline void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev) int virtio_pci_modern_probe(struct virtio_pci_device *); void virtio_pci_modern_remove(struct virtio_pci_device *); +int vp_modern_admin_cmd_exec(struct virtio_device *vdev, + struct virtio_admin_cmd *cmd); + #endif diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index ce915018b5b0f..9bd66300a80aa 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -38,6 +38,132 @@ static bool vp_is_avq(struct virtio_device *vdev, unsigned int index) return index == vp_dev->admin_vq.vq_index; } +static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, + struct scatterlist **sgs, + unsigned int out_num, + unsigned int in_num, + void *data) +{ + struct virtqueue *vq; + int ret, len; + + vq = admin_vq->info.vq; + if (!vq) + return -EIO; + + ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, data, GFP_KERNEL); + if (ret < 0) + return -EIO; + + if (unlikely(!virtqueue_kick(vq))) + return -EIO; + + while (!virtqueue_get_buf(vq, &len) && + !virtqueue_is_broken(vq)) + cpu_relax(); + + if (virtqueue_is_broken(vq)) + return -EIO; + + return 0; +} + +int vp_modern_admin_cmd_exec(struct virtio_device *vdev, + struct virtio_admin_cmd *cmd) +{ + struct scatterlist *sgs[VIRTIO_AVQ_SGS_MAX], hdr, stat; + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_admin_cmd_status *va_status; + unsigned int out_num = 0, in_num = 0; + struct virtio_admin_cmd_hdr *va_hdr; + u16 status; + int ret; + + if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) + return -EOPNOTSUPP; + + va_status = kzalloc(sizeof(*va_status), GFP_KERNEL); + if (!va_status) + return -ENOMEM; + + va_hdr = kzalloc(sizeof(*va_hdr), GFP_KERNEL); + if (!va_hdr) { + ret = -ENOMEM; + goto err_alloc; + } + + va_hdr->opcode = cmd->opcode; + va_hdr->group_type = cmd->group_type; + va_hdr->group_member_id = cmd->group_member_id; + + /* Add header */ + sg_init_one(&hdr, va_hdr, sizeof(*va_hdr)); + sgs[out_num] = &hdr; + out_num++; + + if (cmd->data_sg) { + sgs[out_num] = cmd->data_sg; + out_num++; + } + + /* Add return status */ + sg_init_one(&stat, va_status, sizeof(*va_status)); + sgs[out_num + in_num] = &stat; + in_num++; + + if (cmd->result_sg) { + sgs[out_num + in_num] = cmd->result_sg; + in_num++; + } + + mutex_lock(&vp_dev->admin_vq.cmd_lock); + ret = virtqueue_exec_admin_cmd(&vp_dev->admin_vq, sgs, + out_num, in_num, sgs); + mutex_unlock(&vp_dev->admin_vq.cmd_lock); + + if (ret) { + dev_err(&vdev->dev, + "Failed to execute command on admin vq: %d\n.", ret); + goto err_cmd_exec; + } + + status = le16_to_cpu(va_status->status); + if (status != VIRTIO_ADMIN_STATUS_OK) { + dev_err(&vdev->dev, + "admin command error: status(%#x) qualifier(%#x)\n", + status, le16_to_cpu(va_status->status_qualifier)); + ret = -status; + } + +err_cmd_exec: + kfree(va_hdr); +err_alloc: + kfree(va_status); + return ret; +} + +static void vp_modern_avq_activate(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_admin_vq *admin_vq = &vp_dev->admin_vq; + + if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) + return; + + __virtqueue_unbreak(admin_vq->info.vq); +} + +static void vp_modern_avq_deactivate(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_admin_vq *admin_vq = &vp_dev->admin_vq; + + if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) + return; + + __virtqueue_break(admin_vq->info.vq); +} + static void vp_transport_features(struct virtio_device *vdev, u64 features) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -213,6 +339,8 @@ static void vp_set_status(struct virtio_device *vdev, u8 status) /* We should never be setting status to 0. */ BUG_ON(status == 0); vp_modern_set_status(&vp_dev->mdev, status); + if (status & VIRTIO_CONFIG_S_DRIVER_OK) + vp_modern_avq_activate(vdev); } static void vp_reset(struct virtio_device *vdev) @@ -229,6 +357,9 @@ static void vp_reset(struct virtio_device *vdev) */ while (vp_modern_get_status(mdev)) msleep(1); + + vp_modern_avq_deactivate(vdev); + /* Flush pending VQ/configuration callbacks. */ vp_synchronize_vectors(vdev); } @@ -404,8 +535,11 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, goto err; } - if (is_avq) + if (is_avq) { + mutex_lock(&vp_dev->admin_vq.cmd_lock); vp_dev->admin_vq.info.vq = vq; + mutex_unlock(&vp_dev->admin_vq.cmd_lock); + } return vq; @@ -442,8 +576,11 @@ static void del_vq(struct virtio_pci_vq_info *info) struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_modern_device *mdev = &vp_dev->mdev; - if (vp_is_avq(&vp_dev->vdev, vq->index)) + if (vp_is_avq(&vp_dev->vdev, vq->index)) { + mutex_lock(&vp_dev->admin_vq.cmd_lock); vp_dev->admin_vq.info.vq = NULL; + mutex_unlock(&vp_dev->admin_vq.cmd_lock); + } if (vp_dev->msix_enabled) vp_modern_queue_vector(mdev, vq->index, @@ -662,6 +799,7 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) vp_dev->isr = mdev->isr; vp_dev->vdev.id = mdev->id; + mutex_init(&vp_dev->admin_vq.cmd_lock); return 0; } @@ -669,5 +807,6 @@ void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev) { struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + mutex_destroy(&vp_dev->admin_vq.cmd_lock); vp_modern_remove(mdev); } diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 4cc614a383765..b0201747a263a 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -103,6 +103,14 @@ int virtqueue_resize(struct virtqueue *vq, u32 num, int virtqueue_reset(struct virtqueue *vq, void (*recycle)(struct virtqueue *vq, void *buf)); +struct virtio_admin_cmd { + __le16 opcode; + __le16 group_type; + __le64 group_member_id; + struct scatterlist *data_sg; + struct scatterlist *result_sg; +}; + /** * struct virtio_device - representation of a device using virtio * @index: unique position on the virtio bus diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 240ddeef7eae3..187fd9e34a30f 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -223,4 +223,26 @@ struct virtio_pci_cfg_cap { #endif /* VIRTIO_PCI_NO_MODERN */ +/* Admin command status. */ +#define VIRTIO_ADMIN_STATUS_OK 0 + +struct __packed virtio_admin_cmd_hdr { + __le16 opcode; + /* + * 1 - SR-IOV + * 2-65535 - reserved + */ + __le16 group_type; + /* Unused, reserved for future extensions. */ + __u8 reserved1[12]; + __le64 group_member_id; +}; + +struct __packed virtio_admin_cmd_status { + __le16 status; + __le16 status_qualifier; + /* Unused, reserved for future extensions. */ + __u8 reserved2[4]; +}; + #endif -- GitLab From 388431b9f59bbfde2b5f2fe032b0836158b09ad0 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Tue, 19 Dec 2023 11:32:42 +0200 Subject: [PATCH 0401/1290] virtio-pci: Introduce admin commands Introduces admin commands, as follow: The "list query" command can be used by the driver to query the set of admin commands supported by the virtio device. The "list use" command is used to inform the virtio device which admin commands the driver will use. The "legacy common cfg rd/wr" commands are used to read from/write into the legacy common configuration structure. The "legacy dev cfg rd/wr" commands are used to read from/write into the legacy device configuration structure. The "notify info" command is used to query the notification region information. Signed-off-by: Feng Liu Reviewed-by: Parav Pandit Reviewed-by: Jiri Pirko Acked-by: Michael S. Tsirkin Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20231219093247.170936-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/uapi/linux/virtio_pci.h | 41 +++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 187fd9e34a30f..ef3810dee7efa 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -226,6 +226,20 @@ struct virtio_pci_cfg_cap { /* Admin command status. */ #define VIRTIO_ADMIN_STATUS_OK 0 +/* Admin command opcode. */ +#define VIRTIO_ADMIN_CMD_LIST_QUERY 0x0 +#define VIRTIO_ADMIN_CMD_LIST_USE 0x1 + +/* Admin command group type. */ +#define VIRTIO_ADMIN_GROUP_TYPE_SRIOV 0x1 + +/* Transitional device admin command. */ +#define VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_WRITE 0x2 +#define VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_READ 0x3 +#define VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_WRITE 0x4 +#define VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_READ 0x5 +#define VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO 0x6 + struct __packed virtio_admin_cmd_hdr { __le16 opcode; /* @@ -245,4 +259,31 @@ struct __packed virtio_admin_cmd_status { __u8 reserved2[4]; }; +struct __packed virtio_admin_cmd_legacy_wr_data { + __u8 offset; /* Starting offset of the register(s) to write. */ + __u8 reserved[7]; + __u8 registers[]; +}; + +struct __packed virtio_admin_cmd_legacy_rd_data { + __u8 offset; /* Starting offset of the register(s) to read. */ +}; + +#define VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END 0 +#define VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_DEV 0x1 +#define VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM 0x2 + +#define VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO 4 + +struct __packed virtio_admin_cmd_notify_info_data { + __u8 flags; /* 0 = end of list, 1 = owner device, 2 = member device */ + __u8 bar; /* BAR of the member or the owner device */ + __u8 padding[6]; + __le64 offset; /* Offset within bar. */ +}; + +struct virtio_admin_cmd_notify_info_result { + struct virtio_admin_cmd_notify_info_data entries[VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO]; +}; + #endif -- GitLab From f51e146f1e5c9de02433e51f5a344556502d9c6a Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 Dec 2023 11:32:43 +0200 Subject: [PATCH 0402/1290] virtio-pci: Initialize the supported admin commands Initialize the supported admin commands upon activating the admin queue. The supported commands are saved as part of the admin queue context. Next patches in this series will expose APIs to use them. Reviewed-by: Feng Liu Acked-by: Michael S. Tsirkin Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20231219093247.170936-6-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/virtio/Kconfig | 5 +++ drivers/virtio/virtio_pci_common.h | 19 ++++++++++++ drivers/virtio/virtio_pci_modern.c | 49 ++++++++++++++++++++++++++++-- 3 files changed, 71 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 0a53a61231c29..c17193544268a 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -60,6 +60,11 @@ config VIRTIO_PCI If unsure, say M. +config VIRTIO_PCI_ADMIN_LEGACY + bool + depends on VIRTIO_PCI && (X86 || COMPILE_TEST) + default y + config VIRTIO_PCI_LEGACY bool "Support for legacy virtio draft 0.9.X and older devices" default y diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 282d087a92667..a39bffd5fd462 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -47,6 +47,7 @@ struct virtio_pci_admin_vq { struct virtio_pci_vq_info info; /* serializing admin commands execution and virtqueue deletion */ struct mutex cmd_lock; + u64 supported_cmds; /* Name of the admin queue: avq.$vq_index. */ char name[10]; u16 vq_index; @@ -155,6 +156,24 @@ static inline void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev) int virtio_pci_modern_probe(struct virtio_pci_device *); void virtio_pci_modern_remove(struct virtio_pci_device *); +#define VIRTIO_LEGACY_ADMIN_CMD_BITMAP \ + (BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_WRITE) | \ + BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_READ) | \ + BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_WRITE) | \ + BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_READ) | \ + BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO)) + +/* Unlike modern drivers which support hardware virtio devices, legacy drivers + * assume software-based devices: e.g. they don't use proper memory barriers + * on ARM, use big endian on PPC, etc. X86 drivers are mostly ok though, more + * or less by chance. For now, only support legacy IO on X86. + */ +#ifdef CONFIG_VIRTIO_PCI_ADMIN_LEGACY +#define VIRTIO_ADMIN_CMD_BITMAP VIRTIO_LEGACY_ADMIN_CMD_BITMAP +#else +#define VIRTIO_ADMIN_CMD_BITMAP 0 +#endif + int vp_modern_admin_cmd_exec(struct virtio_device *vdev, struct virtio_admin_cmd *cmd); diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 9bd66300a80aa..f62b530aa3b5b 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -39,6 +39,7 @@ static bool vp_is_avq(struct virtio_device *vdev, unsigned int index) } static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, + u16 opcode, struct scatterlist **sgs, unsigned int out_num, unsigned int in_num, @@ -51,6 +52,11 @@ static int virtqueue_exec_admin_cmd(struct virtio_pci_admin_vq *admin_vq, if (!vq) return -EIO; + if (opcode != VIRTIO_ADMIN_CMD_LIST_QUERY && + opcode != VIRTIO_ADMIN_CMD_LIST_USE && + !((1ULL << opcode) & admin_vq->supported_cmds)) + return -EOPNOTSUPP; + ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, data, GFP_KERNEL); if (ret < 0) return -EIO; @@ -117,8 +123,9 @@ int vp_modern_admin_cmd_exec(struct virtio_device *vdev, } mutex_lock(&vp_dev->admin_vq.cmd_lock); - ret = virtqueue_exec_admin_cmd(&vp_dev->admin_vq, sgs, - out_num, in_num, sgs); + ret = virtqueue_exec_admin_cmd(&vp_dev->admin_vq, + le16_to_cpu(cmd->opcode), + sgs, out_num, in_num, sgs); mutex_unlock(&vp_dev->admin_vq.cmd_lock); if (ret) { @@ -142,6 +149,43 @@ err_alloc: return ret; } +static void virtio_pci_admin_cmd_list_init(struct virtio_device *virtio_dev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(virtio_dev); + struct virtio_admin_cmd cmd = {}; + struct scatterlist result_sg; + struct scatterlist data_sg; + __le64 *data; + int ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return; + + sg_init_one(&result_sg, data, sizeof(*data)); + cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_LIST_QUERY); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.result_sg = &result_sg; + + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + if (ret) + goto end; + + *data &= cpu_to_le64(VIRTIO_ADMIN_CMD_BITMAP); + sg_init_one(&data_sg, data, sizeof(*data)); + cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_LIST_USE); + cmd.data_sg = &data_sg; + cmd.result_sg = NULL; + + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + if (ret) + goto end; + + vp_dev->admin_vq.supported_cmds = le64_to_cpu(*data); +end: + kfree(data); +} + static void vp_modern_avq_activate(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -151,6 +195,7 @@ static void vp_modern_avq_activate(struct virtio_device *vdev) return; __virtqueue_unbreak(admin_vq->info.vq); + virtio_pci_admin_cmd_list_init(vdev); } static void vp_modern_avq_deactivate(struct virtio_device *vdev) -- GitLab From c3fc3e098bd64c560dde49a6e72b21b055150abe Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 Dec 2023 11:32:44 +0200 Subject: [PATCH 0403/1290] virtio-pci: Introduce APIs to execute legacy IO admin commands Introduce APIs to execute legacy IO admin commands. It includes: io_legacy_read/write for both common and the device configuration, io_legacy_notify_info. In addition, exposing an API to check whether the legacy IO commands are supported. (i.e. virtio_pci_admin_has_legacy_io()). Those APIs will be used by the next patches from this series. Note: Unlike modern drivers which support hardware virtio devices, legacy drivers assume software-based devices: e.g. they don't use proper memory barriers on ARM, use big endian on PPC, etc. X86 drivers are mostly ok though, more or less by chance. For now, only support legacy IO on X86. Acked-by: Michael S. Tsirkin Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20231219093247.170936-7-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/virtio/Makefile | 1 + drivers/virtio/virtio_pci_admin_legacy_io.c | 244 ++++++++++++++++++++ drivers/virtio/virtio_pci_common.c | 11 + drivers/virtio/virtio_pci_common.h | 2 + include/linux/virtio_pci_admin.h | 23 ++ 5 files changed, 281 insertions(+) create mode 100644 drivers/virtio/virtio_pci_admin_legacy_io.c create mode 100644 include/linux/virtio_pci_admin.h diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index 8e98d24917cc0..73ace62af4409 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o +virtio_pci-$(CONFIG_VIRTIO_PCI_ADMIN_LEGACY) += virtio_pci_admin_legacy_io.o obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o obj-$(CONFIG_VIRTIO_VDPA) += virtio_vdpa.o diff --git a/drivers/virtio/virtio_pci_admin_legacy_io.c b/drivers/virtio/virtio_pci_admin_legacy_io.c new file mode 100644 index 0000000000000..819cfbbc67c3b --- /dev/null +++ b/drivers/virtio/virtio_pci_admin_legacy_io.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include "virtio_pci_common.h" + +/* + * virtio_pci_admin_has_legacy_io - Checks whether the legacy IO + * commands are supported + * @dev: VF pci_dev + * + * Returns true on success. + */ +bool virtio_pci_admin_has_legacy_io(struct pci_dev *pdev) +{ + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); + struct virtio_pci_device *vp_dev; + + if (!virtio_dev) + return false; + + if (!virtio_has_feature(virtio_dev, VIRTIO_F_ADMIN_VQ)) + return false; + + vp_dev = to_vp_device(virtio_dev); + + if ((vp_dev->admin_vq.supported_cmds & VIRTIO_LEGACY_ADMIN_CMD_BITMAP) == + VIRTIO_LEGACY_ADMIN_CMD_BITMAP) + return true; + return false; +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_has_legacy_io); + +static int virtio_pci_admin_legacy_io_write(struct pci_dev *pdev, u16 opcode, + u8 offset, u8 size, u8 *buf) +{ + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); + struct virtio_admin_cmd_legacy_wr_data *data; + struct virtio_admin_cmd cmd = {}; + struct scatterlist data_sg; + int vf_id; + int ret; + + if (!virtio_dev) + return -ENODEV; + + vf_id = pci_iov_vf_id(pdev); + if (vf_id < 0) + return vf_id; + + data = kzalloc(sizeof(*data) + size, GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->offset = offset; + memcpy(data->registers, buf, size); + sg_init_one(&data_sg, data, sizeof(*data) + size); + cmd.opcode = cpu_to_le16(opcode); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.group_member_id = cpu_to_le64(vf_id + 1); + cmd.data_sg = &data_sg; + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + + kfree(data); + return ret; +} + +/* + * virtio_pci_admin_legacy_io_write_common - Write legacy common configuration + * of a member device + * @dev: VF pci_dev + * @offset: starting byte offset within the common configuration area to write to + * @size: size of the data to write + * @buf: buffer which holds the data + * + * Note: caller must serialize access for the given device. + * Returns 0 on success, or negative on failure. + */ +int virtio_pci_admin_legacy_common_io_write(struct pci_dev *pdev, u8 offset, + u8 size, u8 *buf) +{ + return virtio_pci_admin_legacy_io_write(pdev, + VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_WRITE, + offset, size, buf); +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_legacy_common_io_write); + +/* + * virtio_pci_admin_legacy_io_write_device - Write legacy device configuration + * of a member device + * @dev: VF pci_dev + * @offset: starting byte offset within the device configuration area to write to + * @size: size of the data to write + * @buf: buffer which holds the data + * + * Note: caller must serialize access for the given device. + * Returns 0 on success, or negative on failure. + */ +int virtio_pci_admin_legacy_device_io_write(struct pci_dev *pdev, u8 offset, + u8 size, u8 *buf) +{ + return virtio_pci_admin_legacy_io_write(pdev, + VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_WRITE, + offset, size, buf); +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_legacy_device_io_write); + +static int virtio_pci_admin_legacy_io_read(struct pci_dev *pdev, u16 opcode, + u8 offset, u8 size, u8 *buf) +{ + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); + struct virtio_admin_cmd_legacy_rd_data *data; + struct scatterlist data_sg, result_sg; + struct virtio_admin_cmd cmd = {}; + int vf_id; + int ret; + + if (!virtio_dev) + return -ENODEV; + + vf_id = pci_iov_vf_id(pdev); + if (vf_id < 0) + return vf_id; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->offset = offset; + sg_init_one(&data_sg, data, sizeof(*data)); + sg_init_one(&result_sg, buf, size); + cmd.opcode = cpu_to_le16(opcode); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.group_member_id = cpu_to_le64(vf_id + 1); + cmd.data_sg = &data_sg; + cmd.result_sg = &result_sg; + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + + kfree(data); + return ret; +} + +/* + * virtio_pci_admin_legacy_device_io_read - Read legacy device configuration of + * a member device + * @dev: VF pci_dev + * @offset: starting byte offset within the device configuration area to read from + * @size: size of the data to be read + * @buf: buffer to hold the returned data + * + * Note: caller must serialize access for the given device. + * Returns 0 on success, or negative on failure. + */ +int virtio_pci_admin_legacy_device_io_read(struct pci_dev *pdev, u8 offset, + u8 size, u8 *buf) +{ + return virtio_pci_admin_legacy_io_read(pdev, + VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_READ, + offset, size, buf); +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_legacy_device_io_read); + +/* + * virtio_pci_admin_legacy_common_io_read - Read legacy common configuration of + * a member device + * @dev: VF pci_dev + * @offset: starting byte offset within the common configuration area to read from + * @size: size of the data to be read + * @buf: buffer to hold the returned data + * + * Note: caller must serialize access for the given device. + * Returns 0 on success, or negative on failure. + */ +int virtio_pci_admin_legacy_common_io_read(struct pci_dev *pdev, u8 offset, + u8 size, u8 *buf) +{ + return virtio_pci_admin_legacy_io_read(pdev, + VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_READ, + offset, size, buf); +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_legacy_common_io_read); + +/* + * virtio_pci_admin_legacy_io_notify_info - Read the queue notification + * information for legacy interface + * @dev: VF pci_dev + * @req_bar_flags: requested bar flags + * @bar: on output the BAR number of the owner or member device + * @bar_offset: on output the offset within bar + * + * Returns 0 on success, or negative on failure. + */ +int virtio_pci_admin_legacy_io_notify_info(struct pci_dev *pdev, + u8 req_bar_flags, u8 *bar, + u64 *bar_offset) +{ + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); + struct virtio_admin_cmd_notify_info_result *result; + struct virtio_admin_cmd cmd = {}; + struct scatterlist result_sg; + int vf_id; + int ret; + + if (!virtio_dev) + return -ENODEV; + + vf_id = pci_iov_vf_id(pdev); + if (vf_id < 0) + return vf_id; + + result = kzalloc(sizeof(*result), GFP_KERNEL); + if (!result) + return -ENOMEM; + + sg_init_one(&result_sg, result, sizeof(*result)); + cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.group_member_id = cpu_to_le64(vf_id + 1); + cmd.result_sg = &result_sg; + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + if (!ret) { + struct virtio_admin_cmd_notify_info_data *entry; + int i; + + ret = -ENOENT; + for (i = 0; i < VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO; i++) { + entry = &result->entries[i]; + if (entry->flags == VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END) + break; + if (entry->flags != req_bar_flags) + continue; + *bar = entry->bar; + *bar_offset = le64_to_cpu(entry->offset); + ret = 0; + break; + } + } + + kfree(result); + return ret; +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_legacy_io_notify_info); diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index fafd13d0e4d46..9f93330fc2cc0 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -645,6 +645,17 @@ static struct pci_driver virtio_pci_driver = { .sriov_configure = virtio_pci_sriov_configure, }; +struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev) +{ + struct virtio_pci_device *pf_vp_dev; + + pf_vp_dev = pci_iov_get_pf_drvdata(pdev, &virtio_pci_driver); + if (IS_ERR(pf_vp_dev)) + return NULL; + + return &pf_vp_dev->vdev; +} + module_pci_driver(virtio_pci_driver); MODULE_AUTHOR("Anthony Liguori "); diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index a39bffd5fd462..7fef52bee4557 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -156,6 +156,8 @@ static inline void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev) int virtio_pci_modern_probe(struct virtio_pci_device *); void virtio_pci_modern_remove(struct virtio_pci_device *); +struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev); + #define VIRTIO_LEGACY_ADMIN_CMD_BITMAP \ (BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_WRITE) | \ BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_READ) | \ diff --git a/include/linux/virtio_pci_admin.h b/include/linux/virtio_pci_admin.h new file mode 100644 index 0000000000000..f4a100a0fe2e1 --- /dev/null +++ b/include/linux/virtio_pci_admin.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VIRTIO_PCI_ADMIN_H +#define _LINUX_VIRTIO_PCI_ADMIN_H + +#include +#include + +#ifdef CONFIG_VIRTIO_PCI_ADMIN_LEGACY +bool virtio_pci_admin_has_legacy_io(struct pci_dev *pdev); +int virtio_pci_admin_legacy_common_io_write(struct pci_dev *pdev, u8 offset, + u8 size, u8 *buf); +int virtio_pci_admin_legacy_common_io_read(struct pci_dev *pdev, u8 offset, + u8 size, u8 *buf); +int virtio_pci_admin_legacy_device_io_write(struct pci_dev *pdev, u8 offset, + u8 size, u8 *buf); +int virtio_pci_admin_legacy_device_io_read(struct pci_dev *pdev, u8 offset, + u8 size, u8 *buf); +int virtio_pci_admin_legacy_io_notify_info(struct pci_dev *pdev, + u8 req_bar_flags, u8 *bar, + u64 *bar_offset); +#endif + +#endif /* _LINUX_VIRTIO_PCI_ADMIN_H */ -- GitLab From 8bccc5b80678c69f7729ce4cd232c0aa98fa6277 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 Dec 2023 11:32:45 +0200 Subject: [PATCH 0404/1290] vfio/pci: Expose vfio_pci_core_setup_barmap() Expose vfio_pci_core_setup_barmap() to be used by drivers. This will let drivers to mmap a BAR and re-use it from both vfio and the driver when it's applicable. This API will be used in the next patches by the vfio/virtio coming driver. Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Yishai Hadas Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20231219093247.170936-8-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_rdwr.c | 7 ++++--- include/linux/vfio_pci_core.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index e27de61ac9fe7..a9887fd6de46c 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -200,7 +200,7 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, return done; } -static int vfio_pci_setup_barmap(struct vfio_pci_core_device *vdev, int bar) +int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar) { struct pci_dev *pdev = vdev->pdev; int ret; @@ -223,6 +223,7 @@ static int vfio_pci_setup_barmap(struct vfio_pci_core_device *vdev, int bar) return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_core_setup_barmap); ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) @@ -262,7 +263,7 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, } x_end = end; } else { - int ret = vfio_pci_setup_barmap(vdev, bar); + int ret = vfio_pci_core_setup_barmap(vdev, bar); if (ret) { done = ret; goto out; @@ -438,7 +439,7 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, return -EINVAL; #endif - ret = vfio_pci_setup_barmap(vdev, bar); + ret = vfio_pci_core_setup_barmap(vdev, bar); if (ret) return ret; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 562e8754869da..67ac58e20e1da 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -127,6 +127,7 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); int vfio_pci_core_enable(struct vfio_pci_core_device *vdev); void vfio_pci_core_disable(struct vfio_pci_core_device *vdev); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); +int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state); -- GitLab From 8486ae162b3b6cc1055366f044495cf1966231f1 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 Dec 2023 11:32:46 +0200 Subject: [PATCH 0405/1290] vfio/pci: Expose vfio_pci_core_iowrite/read##size() Expose vfio_pci_core_iowrite/read##size() to let it be used by drivers. This functionality is needed to enable direct access to some physical BAR of the device with the proper locks/checks in place. The next patches from this series will use this functionality on a data path flow when a direct access to the BAR is needed. Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Yishai Hadas Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20231219093247.170936-9-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_rdwr.c | 50 +++++++++++++++++--------------- include/linux/vfio_pci_core.h | 19 ++++++++++++ 2 files changed, 45 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index a9887fd6de46c..07fea08ea8a21 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -38,7 +38,7 @@ #define vfio_iowrite8 iowrite8 #define VFIO_IOWRITE(size) \ -static int vfio_pci_iowrite##size(struct vfio_pci_core_device *vdev, \ +int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev, \ bool test_mem, u##size val, void __iomem *io) \ { \ if (test_mem) { \ @@ -55,7 +55,8 @@ static int vfio_pci_iowrite##size(struct vfio_pci_core_device *vdev, \ up_read(&vdev->memory_lock); \ \ return 0; \ -} +} \ +EXPORT_SYMBOL_GPL(vfio_pci_core_iowrite##size); VFIO_IOWRITE(8) VFIO_IOWRITE(16) @@ -65,7 +66,7 @@ VFIO_IOWRITE(64) #endif #define VFIO_IOREAD(size) \ -static int vfio_pci_ioread##size(struct vfio_pci_core_device *vdev, \ +int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \ bool test_mem, u##size *val, void __iomem *io) \ { \ if (test_mem) { \ @@ -82,7 +83,8 @@ static int vfio_pci_ioread##size(struct vfio_pci_core_device *vdev, \ up_read(&vdev->memory_lock); \ \ return 0; \ -} +} \ +EXPORT_SYMBOL_GPL(vfio_pci_core_ioread##size); VFIO_IOREAD(8) VFIO_IOREAD(16) @@ -119,13 +121,13 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, if (copy_from_user(&val, buf, 4)) return -EFAULT; - ret = vfio_pci_iowrite32(vdev, test_mem, - val, io + off); + ret = vfio_pci_core_iowrite32(vdev, test_mem, + val, io + off); if (ret) return ret; } else { - ret = vfio_pci_ioread32(vdev, test_mem, - &val, io + off); + ret = vfio_pci_core_ioread32(vdev, test_mem, + &val, io + off); if (ret) return ret; @@ -141,13 +143,13 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, if (copy_from_user(&val, buf, 2)) return -EFAULT; - ret = vfio_pci_iowrite16(vdev, test_mem, - val, io + off); + ret = vfio_pci_core_iowrite16(vdev, test_mem, + val, io + off); if (ret) return ret; } else { - ret = vfio_pci_ioread16(vdev, test_mem, - &val, io + off); + ret = vfio_pci_core_ioread16(vdev, test_mem, + &val, io + off); if (ret) return ret; @@ -163,13 +165,13 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, if (copy_from_user(&val, buf, 1)) return -EFAULT; - ret = vfio_pci_iowrite8(vdev, test_mem, - val, io + off); + ret = vfio_pci_core_iowrite8(vdev, test_mem, + val, io + off); if (ret) return ret; } else { - ret = vfio_pci_ioread8(vdev, test_mem, - &val, io + off); + ret = vfio_pci_core_ioread8(vdev, test_mem, + &val, io + off); if (ret) return ret; @@ -364,21 +366,21 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, { switch (ioeventfd->count) { case 1: - vfio_pci_iowrite8(ioeventfd->vdev, test_mem, - ioeventfd->data, ioeventfd->addr); + vfio_pci_core_iowrite8(ioeventfd->vdev, test_mem, + ioeventfd->data, ioeventfd->addr); break; case 2: - vfio_pci_iowrite16(ioeventfd->vdev, test_mem, - ioeventfd->data, ioeventfd->addr); + vfio_pci_core_iowrite16(ioeventfd->vdev, test_mem, + ioeventfd->data, ioeventfd->addr); break; case 4: - vfio_pci_iowrite32(ioeventfd->vdev, test_mem, - ioeventfd->data, ioeventfd->addr); + vfio_pci_core_iowrite32(ioeventfd->vdev, test_mem, + ioeventfd->data, ioeventfd->addr); break; #ifdef iowrite64 case 8: - vfio_pci_iowrite64(ioeventfd->vdev, test_mem, - ioeventfd->data, ioeventfd->addr); + vfio_pci_core_iowrite64(ioeventfd->vdev, test_mem, + ioeventfd->data, ioeventfd->addr); break; #endif } diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 67ac58e20e1da..85e84b92751b6 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -131,4 +131,23 @@ int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state); +#define VFIO_IOWRITE_DECLATION(size) \ +int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev, \ + bool test_mem, u##size val, void __iomem *io); + +VFIO_IOWRITE_DECLATION(8) +VFIO_IOWRITE_DECLATION(16) +VFIO_IOWRITE_DECLATION(32) +#ifdef iowrite64 +VFIO_IOWRITE_DECLATION(64) +#endif + +#define VFIO_IOREAD_DECLATION(size) \ +int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \ + bool test_mem, u##size *val, void __iomem *io); + +VFIO_IOREAD_DECLATION(8) +VFIO_IOREAD_DECLATION(16) +VFIO_IOREAD_DECLATION(32) + #endif /* VFIO_PCI_CORE_H */ -- GitLab From eb61eca0e8c3efbdd6d3c9d2b7f70bf67e165f7d Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 Dec 2023 11:32:47 +0200 Subject: [PATCH 0406/1290] vfio/virtio: Introduce a vfio driver over virtio devices Introduce a vfio driver over virtio devices to support the legacy interface functionality for VFs. Background, from the virtio spec [1]. -------------------------------------------------------------------- In some systems, there is a need to support a virtio legacy driver with a device that does not directly support the legacy interface. In such scenarios, a group owner device can provide the legacy interface functionality for the group member devices. The driver of the owner device can then access the legacy interface of a member device on behalf of the legacy member device driver. For example, with the SR-IOV group type, group members (VFs) can not present the legacy interface in an I/O BAR in BAR0 as expected by the legacy pci driver. If the legacy driver is running inside a virtual machine, the hypervisor executing the virtual machine can present a virtual device with an I/O BAR in BAR0. The hypervisor intercepts the legacy driver accesses to this I/O BAR and forwards them to the group owner device (PF) using group administration commands. -------------------------------------------------------------------- Specifically, this driver adds support for a virtio-net VF to be exposed as a transitional device to a guest driver and allows the legacy IO BAR functionality on top. This allows a VM which uses a legacy virtio-net driver in the guest to work transparently over a VF which its driver in the host is that new driver. The driver can be extended easily to support some other types of virtio devices (e.g virtio-blk), by adding in a few places the specific type properties as was done for virtio-net. For now, only the virtio-net use case was tested and as such we introduce the support only for such a device. Practically, Upon probing a VF for a virtio-net device, in case its PF supports legacy access over the virtio admin commands and the VF doesn't have BAR 0, we set some specific 'vfio_device_ops' to be able to simulate in SW a transitional device with I/O BAR in BAR 0. The existence of the simulated I/O bar is reported later on by overwriting the VFIO_DEVICE_GET_REGION_INFO command and the device exposes itself as a transitional device by overwriting some properties upon reading its config space. Once we report the existence of I/O BAR as BAR 0 a legacy driver in the guest may use it via read/write calls according to the virtio specification. Any read/write towards the control parts of the BAR will be captured by the new driver and will be translated into admin commands towards the device. In addition, any data path read/write access (i.e. virtio driver notifications) will be captured by the driver and forwarded to the physical BAR which its properties were supplied by the admin command VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO upon the probing/init flow. With that code in place a legacy driver in the guest has the look and feel as if having a transitional device with legacy support for both its control and data path flows. [1] https://github.com/oasis-tcs/virtio-spec/commit/03c2d32e5093ca9f2a17797242fbef88efe94b8c Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Yishai Hadas Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20231219093247.170936-10-yishaih@nvidia.com Signed-off-by: Alex Williamson --- MAINTAINERS | 7 + drivers/vfio/pci/Kconfig | 2 + drivers/vfio/pci/Makefile | 2 + drivers/vfio/pci/virtio/Kconfig | 15 + drivers/vfio/pci/virtio/Makefile | 3 + drivers/vfio/pci/virtio/main.c | 576 +++++++++++++++++++++++++++++++ 6 files changed, 605 insertions(+) create mode 100644 drivers/vfio/pci/virtio/Kconfig create mode 100644 drivers/vfio/pci/virtio/Makefile create mode 100644 drivers/vfio/pci/virtio/main.c diff --git a/MAINTAINERS b/MAINTAINERS index 788be9ab5b733..c4cb7df15cd93 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22872,6 +22872,13 @@ L: kvm@vger.kernel.org S: Maintained F: drivers/vfio/pci/mlx5/ +VFIO VIRTIO PCI DRIVER +M: Yishai Hadas +L: kvm@vger.kernel.org +L: virtualization@lists.linux-foundation.org +S: Maintained +F: drivers/vfio/pci/virtio + VFIO PCI DEVICE SPECIFIC DRIVERS R: Jason Gunthorpe R: Yishai Hadas diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 8125e5f37832c..18c397df566d8 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -65,4 +65,6 @@ source "drivers/vfio/pci/hisilicon/Kconfig" source "drivers/vfio/pci/pds/Kconfig" +source "drivers/vfio/pci/virtio/Kconfig" + endmenu diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 45167be462d8f..046139a4eca5b 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -13,3 +13,5 @@ obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/ obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/ obj-$(CONFIG_PDS_VFIO_PCI) += pds/ + +obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ diff --git a/drivers/vfio/pci/virtio/Kconfig b/drivers/vfio/pci/virtio/Kconfig new file mode 100644 index 0000000000000..fc3a0be9d8d42 --- /dev/null +++ b/drivers/vfio/pci/virtio/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +config VIRTIO_VFIO_PCI + tristate "VFIO support for VIRTIO NET PCI devices" + depends on VIRTIO_PCI_ADMIN_LEGACY + select VFIO_PCI_CORE + help + This provides support for exposing VIRTIO NET VF devices which support + legacy IO access, using the VFIO framework that can work with a legacy + virtio driver in the guest. + Based on PCIe spec, VFs do not support I/O Space. + As of that this driver emulates I/O BAR in software to let a VF be + seen as a transitional device by its users and let it work with + a legacy driver. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/pci/virtio/Makefile b/drivers/vfio/pci/virtio/Makefile new file mode 100644 index 0000000000000..7171105baf330 --- /dev/null +++ b/drivers/vfio/pci/virtio/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio-vfio-pci.o +virtio-vfio-pci-y := main.o diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c new file mode 100644 index 0000000000000..291c55b641f15 --- /dev/null +++ b/drivers/vfio/pci/virtio/main.c @@ -0,0 +1,576 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct virtiovf_pci_core_device { + struct vfio_pci_core_device core_device; + u8 *bar0_virtual_buf; + /* synchronize access to the virtual buf */ + struct mutex bar_mutex; + void __iomem *notify_addr; + u64 notify_offset; + __le32 pci_base_addr_0; + __le16 pci_cmd; + u8 bar0_virtual_buf_size; + u8 notify_bar; +}; + +static int +virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev, + loff_t pos, char __user *buf, + size_t count, bool read) +{ + bool msix_enabled = + (virtvdev->core_device.irq_type == VFIO_PCI_MSIX_IRQ_INDEX); + struct pci_dev *pdev = virtvdev->core_device.pdev; + u8 *bar0_buf = virtvdev->bar0_virtual_buf; + bool common; + u8 offset; + int ret; + + common = pos < VIRTIO_PCI_CONFIG_OFF(msix_enabled); + /* offset within the relevant configuration area */ + offset = common ? pos : pos - VIRTIO_PCI_CONFIG_OFF(msix_enabled); + mutex_lock(&virtvdev->bar_mutex); + if (read) { + if (common) + ret = virtio_pci_admin_legacy_common_io_read(pdev, offset, + count, bar0_buf + pos); + else + ret = virtio_pci_admin_legacy_device_io_read(pdev, offset, + count, bar0_buf + pos); + if (ret) + goto out; + if (copy_to_user(buf, bar0_buf + pos, count)) + ret = -EFAULT; + } else { + if (copy_from_user(bar0_buf + pos, buf, count)) { + ret = -EFAULT; + goto out; + } + + if (common) + ret = virtio_pci_admin_legacy_common_io_write(pdev, offset, + count, bar0_buf + pos); + else + ret = virtio_pci_admin_legacy_device_io_write(pdev, offset, + count, bar0_buf + pos); + } +out: + mutex_unlock(&virtvdev->bar_mutex); + return ret; +} + +static int +virtiovf_pci_bar0_rw(struct virtiovf_pci_core_device *virtvdev, + loff_t pos, char __user *buf, + size_t count, bool read) +{ + struct vfio_pci_core_device *core_device = &virtvdev->core_device; + struct pci_dev *pdev = core_device->pdev; + u16 queue_notify; + int ret; + + if (!(le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO)) + return -EIO; + + if (pos + count > virtvdev->bar0_virtual_buf_size) + return -EINVAL; + + ret = pm_runtime_resume_and_get(&pdev->dev); + if (ret) { + pci_info_ratelimited(pdev, "runtime resume failed %d\n", ret); + return -EIO; + } + + switch (pos) { + case VIRTIO_PCI_QUEUE_NOTIFY: + if (count != sizeof(queue_notify)) { + ret = -EINVAL; + goto end; + } + if (read) { + ret = vfio_pci_core_ioread16(core_device, true, &queue_notify, + virtvdev->notify_addr); + if (ret) + goto end; + if (copy_to_user(buf, &queue_notify, + sizeof(queue_notify))) { + ret = -EFAULT; + goto end; + } + } else { + if (copy_from_user(&queue_notify, buf, count)) { + ret = -EFAULT; + goto end; + } + ret = vfio_pci_core_iowrite16(core_device, true, queue_notify, + virtvdev->notify_addr); + } + break; + default: + ret = virtiovf_issue_legacy_rw_cmd(virtvdev, pos, buf, count, + read); + } + +end: + pm_runtime_put(&pdev->dev); + return ret ? ret : count; +} + +static bool range_intersect_range(loff_t range1_start, size_t count1, + loff_t range2_start, size_t count2, + loff_t *start_offset, + size_t *intersect_count, + size_t *register_offset) +{ + if (range1_start <= range2_start && + range1_start + count1 > range2_start) { + *start_offset = range2_start - range1_start; + *intersect_count = min_t(size_t, count2, + range1_start + count1 - range2_start); + *register_offset = 0; + return true; + } + + if (range1_start > range2_start && + range1_start < range2_start + count2) { + *start_offset = 0; + *intersect_count = min_t(size_t, count1, + range2_start + count2 - range1_start); + *register_offset = range1_start - range2_start; + return true; + } + + return false; +} + +static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, + char __user *buf, size_t count, + loff_t *ppos) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + size_t register_offset; + loff_t copy_offset; + size_t copy_count; + __le32 val32; + __le16 val16; + u8 val8; + int ret; + + ret = vfio_pci_core_read(core_vdev, buf, count, ppos); + if (ret < 0) + return ret; + + if (range_intersect_range(pos, count, PCI_DEVICE_ID, sizeof(val16), + ©_offset, ©_count, ®ister_offset)) { + val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET); + if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count)) + return -EFAULT; + } + + if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) && + range_intersect_range(pos, count, PCI_COMMAND, sizeof(val16), + ©_offset, ©_count, ®ister_offset)) { + if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset, + copy_count)) + return -EFAULT; + val16 |= cpu_to_le16(PCI_COMMAND_IO); + if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, + copy_count)) + return -EFAULT; + } + + if (range_intersect_range(pos, count, PCI_REVISION_ID, sizeof(val8), + ©_offset, ©_count, ®ister_offset)) { + /* Transional needs to have revision 0 */ + val8 = 0; + if (copy_to_user(buf + copy_offset, &val8, copy_count)) + return -EFAULT; + } + + if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, sizeof(val32), + ©_offset, ©_count, ®ister_offset)) { + u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1); + u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0); + + val32 = cpu_to_le32((pci_base_addr_0 & bar_mask) | PCI_BASE_ADDRESS_SPACE_IO); + if (copy_to_user(buf + copy_offset, (void *)&val32 + register_offset, copy_count)) + return -EFAULT; + } + + if (range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, sizeof(val16), + ©_offset, ©_count, ®ister_offset)) { + /* + * Transitional devices use the PCI subsystem device id as + * virtio device id, same as legacy driver always did. + */ + val16 = cpu_to_le16(VIRTIO_ID_NET); + if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, + copy_count)) + return -EFAULT; + } + + if (range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, sizeof(val16), + ©_offset, ©_count, ®ister_offset)) { + val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET); + if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, + copy_count)) + return -EFAULT; + } + + return count; +} + +static ssize_t +virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf, + size_t count, loff_t *ppos) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + + if (!count) + return 0; + + if (index == VFIO_PCI_CONFIG_REGION_INDEX) + return virtiovf_pci_read_config(core_vdev, buf, count, ppos); + + if (index == VFIO_PCI_BAR0_REGION_INDEX) + return virtiovf_pci_bar0_rw(virtvdev, pos, buf, count, true); + + return vfio_pci_core_read(core_vdev, buf, count, ppos); +} + +static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev, + const char __user *buf, size_t count, + loff_t *ppos) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + size_t register_offset; + loff_t copy_offset; + size_t copy_count; + + if (range_intersect_range(pos, count, PCI_COMMAND, sizeof(virtvdev->pci_cmd), + ©_offset, ©_count, + ®ister_offset)) { + if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset, + buf + copy_offset, + copy_count)) + return -EFAULT; + } + + if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, + sizeof(virtvdev->pci_base_addr_0), + ©_offset, ©_count, + ®ister_offset)) { + if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset, + buf + copy_offset, + copy_count)) + return -EFAULT; + } + + return vfio_pci_core_write(core_vdev, buf, count, ppos); +} + +static ssize_t +virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + + if (!count) + return 0; + + if (index == VFIO_PCI_CONFIG_REGION_INDEX) + return virtiovf_pci_write_config(core_vdev, buf, count, ppos); + + if (index == VFIO_PCI_BAR0_REGION_INDEX) + return virtiovf_pci_bar0_rw(virtvdev, pos, (char __user *)buf, count, false); + + return vfio_pci_core_write(core_vdev, buf, count, ppos); +} + +static int +virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + void __user *uarg = (void __user *)arg; + struct vfio_region_info info = {}; + + if (copy_from_user(&info, uarg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + switch (info.index) { + case VFIO_PCI_BAR0_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = virtvdev->bar0_virtual_buf_size; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0; + default: + return vfio_pci_core_ioctl(core_vdev, cmd, arg); + } +} + +static long +virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case VFIO_DEVICE_GET_REGION_INFO: + return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg); + default: + return vfio_pci_core_ioctl(core_vdev, cmd, arg); + } +} + +static int +virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) +{ + struct vfio_pci_core_device *core_device = &virtvdev->core_device; + int ret; + + /* + * Setup the BAR where the 'notify' exists to be used by vfio as well + * This will let us mmap it only once and use it when needed. + */ + ret = vfio_pci_core_setup_barmap(core_device, + virtvdev->notify_bar); + if (ret) + return ret; + + virtvdev->notify_addr = core_device->barmap[virtvdev->notify_bar] + + virtvdev->notify_offset; + return 0; +} + +static int virtiovf_pci_open_device(struct vfio_device *core_vdev) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &virtvdev->core_device; + int ret; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + if (virtvdev->bar0_virtual_buf) { + /* + * Upon close_device() the vfio_pci_core_disable() is called + * and will close all the previous mmaps, so it seems that the + * valid life cycle for the 'notify' addr is per open/close. + */ + ret = virtiovf_set_notify_addr(virtvdev); + if (ret) { + vfio_pci_core_disable(vdev); + return ret; + } + } + + vfio_pci_core_finish_enable(vdev); + return 0; +} + +static int virtiovf_get_device_config_size(unsigned short device) +{ + /* Network card */ + return offsetofend(struct virtio_net_config, status); +} + +static int virtiovf_read_notify_info(struct virtiovf_pci_core_device *virtvdev) +{ + u64 offset; + int ret; + u8 bar; + + ret = virtio_pci_admin_legacy_io_notify_info(virtvdev->core_device.pdev, + VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM, + &bar, &offset); + if (ret) + return ret; + + virtvdev->notify_bar = bar; + virtvdev->notify_offset = offset; + return 0; +} + +static int virtiovf_pci_init_device(struct vfio_device *core_vdev) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + struct pci_dev *pdev; + int ret; + + ret = vfio_pci_core_init_dev(core_vdev); + if (ret) + return ret; + + pdev = virtvdev->core_device.pdev; + ret = virtiovf_read_notify_info(virtvdev); + if (ret) + return ret; + + virtvdev->bar0_virtual_buf_size = VIRTIO_PCI_CONFIG_OFF(true) + + virtiovf_get_device_config_size(pdev->device); + BUILD_BUG_ON(!is_power_of_2(virtvdev->bar0_virtual_buf_size)); + virtvdev->bar0_virtual_buf = kzalloc(virtvdev->bar0_virtual_buf_size, + GFP_KERNEL); + if (!virtvdev->bar0_virtual_buf) + return -ENOMEM; + mutex_init(&virtvdev->bar_mutex); + return 0; +} + +static void virtiovf_pci_core_release_dev(struct vfio_device *core_vdev) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + + kfree(virtvdev->bar0_virtual_buf); + vfio_pci_core_release_dev(core_vdev); +} + +static const struct vfio_device_ops virtiovf_vfio_pci_tran_ops = { + .name = "virtio-vfio-pci-trans", + .init = virtiovf_pci_init_device, + .release = virtiovf_pci_core_release_dev, + .open_device = virtiovf_pci_open_device, + .close_device = vfio_pci_core_close_device, + .ioctl = virtiovf_vfio_pci_core_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = virtiovf_pci_core_read, + .write = virtiovf_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static const struct vfio_device_ops virtiovf_vfio_pci_ops = { + .name = "virtio-vfio-pci", + .init = vfio_pci_core_init_dev, + .release = vfio_pci_core_release_dev, + .open_device = virtiovf_pci_open_device, + .close_device = vfio_pci_core_close_device, + .ioctl = vfio_pci_core_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static bool virtiovf_bar0_exists(struct pci_dev *pdev) +{ + struct resource *res = pdev->resource; + + return res->flags; +} + +static int virtiovf_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + const struct vfio_device_ops *ops = &virtiovf_vfio_pci_ops; + struct virtiovf_pci_core_device *virtvdev; + int ret; + + if (pdev->is_virtfn && virtio_pci_admin_has_legacy_io(pdev) && + !virtiovf_bar0_exists(pdev)) + ops = &virtiovf_vfio_pci_tran_ops; + + virtvdev = vfio_alloc_device(virtiovf_pci_core_device, core_device.vdev, + &pdev->dev, ops); + if (IS_ERR(virtvdev)) + return PTR_ERR(virtvdev); + + dev_set_drvdata(&pdev->dev, &virtvdev->core_device); + ret = vfio_pci_core_register_device(&virtvdev->core_device); + if (ret) + goto out; + return 0; +out: + vfio_put_device(&virtvdev->core_device.vdev); + return ret; +} + +static void virtiovf_pci_remove(struct pci_dev *pdev) +{ + struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev); + + vfio_pci_core_unregister_device(&virtvdev->core_device); + vfio_put_device(&virtvdev->core_device.vdev); +} + +static const struct pci_device_id virtiovf_pci_table[] = { + /* Only virtio-net is supported/tested so far */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1041) }, + {} +}; + +MODULE_DEVICE_TABLE(pci, virtiovf_pci_table); + +void virtiovf_pci_aer_reset_done(struct pci_dev *pdev) +{ + struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev); + + virtvdev->pci_cmd = 0; +} + +static const struct pci_error_handlers virtiovf_err_handlers = { + .reset_done = virtiovf_pci_aer_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + +static struct pci_driver virtiovf_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = virtiovf_pci_table, + .probe = virtiovf_pci_probe, + .remove = virtiovf_pci_remove, + .err_handler = &virtiovf_err_handlers, + .driver_managed_dma = true, +}; + +module_pci_driver(virtiovf_pci_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yishai Hadas "); +MODULE_DESCRIPTION( + "VIRTIO VFIO PCI - User Level meta-driver for VIRTIO NET devices"); -- GitLab From d5cfbdfc96aadc96a2bf6176269b994e2ce31717 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:15 -0500 Subject: [PATCH 0407/1290] ring-buffer: Have ring_buffer_print_page_header() be able to access ring_buffer_iter In order to introduce sub-buffer size per ring buffer, some internal refactoring is needed. As ring_buffer_print_page_header() will depend on the trace_buffer structure, it is moved after the structure definition. Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-2-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185627.723857541@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 60 +++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f7dc74e45ebfa..2400c8e68fd33 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -379,36 +379,6 @@ static inline bool test_time_stamp(u64 delta) /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) -int ring_buffer_print_page_header(struct trace_seq *s) -{ - struct buffer_data_page field; - - trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\tsigned:%u;\n", - (unsigned int)sizeof(field.time_stamp), - (unsigned int)is_signed_type(u64)); - - trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit), - (unsigned int)is_signed_type(long)); - - trace_seq_printf(s, "\tfield: int overwrite;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - 1, - (unsigned int)is_signed_type(long)); - - trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE, - (unsigned int)is_signed_type(char)); - - return !trace_seq_has_overflowed(s); -} - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; @@ -556,6 +526,36 @@ struct ring_buffer_iter { int missed_events; }; +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + + trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); + + trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); + + return !trace_seq_has_overflowed(s); +} + static inline void rb_time_read(rb_time_t *t, u64 *ret) { *ret = local64_read(&t->time); -- GitLab From 139f84002145d8624f0195fb090b3a7670744a13 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:16 -0500 Subject: [PATCH 0408/1290] ring-buffer: Page size per ring buffer Currently the size of one sub buffer page is global for all buffers and it is hard coded to one system page. In order to introduce configurable ring buffer sub page size, the internal logic should be refactored to work with sub page size per ring buffer. Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-3-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.009147038@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 68 +++++++++++++++++++++---------------- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 59 ++++++++++++++++++++++++-------- 5 files changed, 86 insertions(+), 46 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b1b03b2c0f08e..ce46218ce46df 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -200,7 +200,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page, struct trace_seq; int ring_buffer_print_entry_header(struct trace_seq *s); -int ring_buffer_print_page_header(struct trace_seq *s); +int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s); enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2400c8e68fd33..d9f6565024004 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -374,11 +374,6 @@ static inline bool test_time_stamp(u64 delta) return !!(delta & TS_DELTA_TEST); } -#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) - -/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ -#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; @@ -510,6 +505,9 @@ struct trace_buffer { struct rb_irq_work irq_work; bool time_stamp_abs; + + unsigned int subbuf_size; + unsigned int max_data_size; }; struct ring_buffer_iter { @@ -523,10 +521,11 @@ struct ring_buffer_iter { u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; + size_t event_size; int missed_events; }; -int ring_buffer_print_page_header(struct trace_seq *s) +int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s) { struct buffer_data_page field; @@ -550,7 +549,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE, + (unsigned int)buffer->subbuf_size, (unsigned int)is_signed_type(char)); return !trace_seq_has_overflowed(s); @@ -1625,7 +1624,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + /* Default buffer page size - one system page */ + buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE; + + /* Max payload is buffer page size - header (8bytes) */ + buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2); + + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; @@ -1944,7 +1949,7 @@ static void update_pages_handler(struct work_struct *work) * @size: the new size. * @cpu_id: the cpu buffer to resize * - * Minimum size is 2 * BUF_PAGE_SIZE. + * Minimum size is 2 * buffer->subbuf_size. * * Returns 0 on success and < 0 on failure. */ @@ -1966,7 +1971,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, !cpumask_test_cpu(cpu_id, buffer->cpumask)) return 0; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); /* we need a minimum of two pages */ if (nr_pages < 2) @@ -2213,7 +2218,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) */ barrier(); - if ((iter->head + length) > commit || length > BUF_PAGE_SIZE) + if ((iter->head + length) > commit || length > iter->event_size) /* Writer corrupted the read? */ goto reset; @@ -2446,6 +2451,7 @@ static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { + unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; unsigned long length = info->length; @@ -2454,13 +2460,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * Only the event that crossed the page boundary * must fill the old tail_page with padding. */ - if (tail >= BUF_PAGE_SIZE) { + if (tail >= bsize) { /* * If the page was filled, then we still need * to update the real_end. Reset it to zero * and the reader will ignore it. */ - if (tail == BUF_PAGE_SIZE) + if (tail == bsize) tail_page->real_end = 0; local_sub(length, &tail_page->write); @@ -2488,7 +2494,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * If we are less than the minimum size, we don't need to * worry about it. */ - if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { + if (tail > (bsize - RB_EVNT_MIN_SIZE)) { /* No room for any events */ /* Mark the rest of the page with padding */ @@ -2503,19 +2509,19 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, } /* Put in a discarded event */ - event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; + event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; /* account for padding bytes */ - local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + local_add(bsize - tail, &cpu_buffer->entries_bytes); /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); /* Set write to end of buffer */ - length = (tail + length) - BUF_PAGE_SIZE; + length = (tail + length) - bsize; local_sub(length, &tail_page->write); } @@ -3469,7 +3475,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - info->length; /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) { + if (unlikely(write > cpu_buffer->buffer->subbuf_size)) { check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } @@ -3600,7 +3606,7 @@ rb_reserve_next_event(struct trace_buffer *buffer, if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { add_ts_default = RB_ADD_STAMP_ABSOLUTE; info.length += RB_LEN_TIME_EXTEND; - if (info.length > BUF_MAX_DATA_SIZE) + if (info.length > cpu_buffer->buffer->max_data_size) goto out_fail; } else { add_ts_default = RB_ADD_STAMP_NONE; @@ -3675,7 +3681,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) if (unlikely(atomic_read(&cpu_buffer->record_disabled))) goto out; - if (unlikely(length > BUF_MAX_DATA_SIZE)) + if (unlikely(length > buffer->max_data_size)) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) @@ -3825,7 +3831,7 @@ int ring_buffer_write(struct trace_buffer *buffer, if (atomic_read(&cpu_buffer->record_disabled)) goto out; - if (length > BUF_MAX_DATA_SIZE) + if (length > buffer->max_data_size) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) @@ -4405,6 +4411,7 @@ static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; + unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); unsigned long overwrite; unsigned long flags; int nr_loops = 0; @@ -4540,7 +4547,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) #define USECS_WAIT 1000000 for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { /* If the write is past the end of page, a writer is still updating it */ - if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE)) + if (likely(!reader || rb_page_write(reader) <= bsize)) break; udelay(1); @@ -4984,7 +4991,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) return NULL; /* Holds the entire event: data and meta data */ - iter->event = kmalloc(BUF_PAGE_SIZE, flags); + iter->event_size = buffer->subbuf_size; + iter->event = kmalloc(iter->event_size, flags); if (!iter->event) { kfree(iter); return NULL; @@ -5102,14 +5110,14 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) { /* * Earlier, this method returned - * BUF_PAGE_SIZE * buffer->nr_pages + * buffer->subbuf_size * buffer->nr_pages * Since the nr_pages field is now removed, we have converted this to * return the per cpu buffer value. */ if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; - return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; + return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages; } EXPORT_SYMBOL_GPL(ring_buffer_size); @@ -5123,8 +5131,8 @@ unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer) { /* If abs timestamp is requested, events have a timestamp too */ if (ring_buffer_time_stamp_abs(buffer)) - return BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND; - return BUF_MAX_DATA_SIZE; + return buffer->max_data_size - RB_LEN_TIME_EXTEND; + return buffer->max_data_size; } EXPORT_SYMBOL_GPL(ring_buffer_max_event_size); @@ -5730,7 +5738,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* If there is room at the end of the page to save the * missed events, then record it there. */ - if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { + if (buffer->subbuf_size - commit >= sizeof(missed_events)) { memcpy(&bpage->data[commit], &missed_events, sizeof(missed_events)); local_add(RB_MISSED_STORED, &bpage->commit); @@ -5742,8 +5750,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* * This page may be off to user land. Zero it out here. */ - if (commit < BUF_PAGE_SIZE) - memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); + if (commit < buffer->subbuf_size) + memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); out_unlock: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 55dabee4c78b1..76dd0a4c8cb5e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5018,7 +5018,7 @@ static int tracing_release(struct inode *inode, struct file *file) return 0; } -static int tracing_release_generic_tr(struct inode *inode, struct file *file) +int tracing_release_generic_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79180aed13eee..00f873910c5d9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -616,6 +616,7 @@ void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); +int tracing_release_generic_tr(struct inode *inode, struct file *file); int tracing_open_file_tr(struct inode *inode, struct file *filp); int tracing_release_file_tr(struct inode *inode, struct file *filp); int tracing_single_release_file_tr(struct inode *inode, struct file *filp); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b70d038180384..7c364b87352ee 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1893,9 +1893,9 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } static ssize_t -show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_array *tr = filp->private_data; struct trace_seq *s; int r; @@ -1908,7 +1908,31 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) trace_seq_init(s); - func(s); + ring_buffer_print_page_header(tr->array_buffer.buffer, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); + + kfree(s); + + return r; +} + +static ssize_t +show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + ring_buffer_print_entry_header(s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); @@ -2165,10 +2189,18 @@ static const struct file_operations ftrace_tr_enable_fops = { .release = subsystem_release, }; -static const struct file_operations ftrace_show_header_fops = { - .open = tracing_open_generic, - .read = show_header, +static const struct file_operations ftrace_show_header_page_fops = { + .open = tracing_open_generic_tr, + .read = show_header_page_file, .llseek = default_llseek, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations ftrace_show_header_event_fops = { + .open = tracing_open_generic_tr, + .read = show_header_event_file, + .llseek = default_llseek, + .release = tracing_release_generic_tr, }; static int @@ -3794,17 +3826,16 @@ static int events_callback(const char *name, umode_t *mode, void **data, return 1; } - if (strcmp(name, "header_page") == 0) - *data = ring_buffer_print_page_header; - - else if (strcmp(name, "header_event") == 0) - *data = ring_buffer_print_entry_header; + if (strcmp(name, "header_page") == 0) { + *mode = TRACE_MODE_READ; + *fops = &ftrace_show_header_page_fops; - else + } else if (strcmp(name, "header_event") == 0) { + *mode = TRACE_MODE_READ; + *fops = &ftrace_show_header_event_fops; + } else return 0; - *mode = TRACE_MODE_READ; - *fops = &ftrace_show_header_fops; return 1; } -- GitLab From 2808e31ec12e5fbe2ae25acc027fcdc67b1fb7f0 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:17 -0500 Subject: [PATCH 0409/1290] ring-buffer: Add interface for configuring trace sub buffer size The trace ring buffer sub page size can be configured, per trace instance. A new ftrace file "buffer_subbuf_order" is added to get and set the size of the ring buffer sub page for current trace instance. The size must be an order of system page size, that's why the new interface works with system page order, instead of absolute page size: 0 means the ring buffer sub page is equal to 1 system page and so forth: 0 - 1 system page 1 - 2 system pages 2 - 4 system pages ... The ring buffer sub page size is limited between 1 and 128 system pages. The default value is 1 system page. New ring buffer APIs are introduced: ring_buffer_subbuf_order_set() ring_buffer_subbuf_order_get() ring_buffer_subbuf_size_get() Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-4-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.298324722@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 4 ++ kernel/trace/ring_buffer.c | 73 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace.c | 48 ++++++++++++++++++++++++ 3 files changed, 125 insertions(+) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index ce46218ce46df..12573306b8892 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -202,6 +202,10 @@ struct trace_seq; int ring_buffer_print_entry_header(struct trace_seq *s); int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s); +int ring_buffer_subbuf_order_get(struct trace_buffer *buffer); +int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order); +int ring_buffer_subbuf_size_get(struct trace_buffer *buffer); + enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d9f6565024004..20fc0121735d3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -507,6 +507,7 @@ struct trace_buffer { bool time_stamp_abs; unsigned int subbuf_size; + unsigned int subbuf_order; unsigned int max_data_size; }; @@ -5761,6 +5762,78 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); +/** + * ring_buffer_subbuf_size_get - get size of the sub buffer. + * @buffer: the buffer to get the sub buffer size from + * + * Returns size of the sub buffer, in bytes. + */ +int ring_buffer_subbuf_size_get(struct trace_buffer *buffer) +{ + return buffer->subbuf_size + BUF_PAGE_HDR_SIZE; +} +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get); + +/** + * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page. + * @buffer: The ring_buffer to get the system sub page order from + * + * By default, one ring buffer sub page equals to one system page. This parameter + * is configurable, per ring buffer. The size of the ring buffer sub page can be + * extended, but must be an order of system page size. + * + * Returns the order of buffer sub page size, in system pages: + * 0 means the sub buffer size is 1 system page and so forth. + * In case of an error < 0 is returned. + */ +int ring_buffer_subbuf_order_get(struct trace_buffer *buffer) +{ + if (!buffer) + return -EINVAL; + + return buffer->subbuf_order; +} +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get); + +/** + * ring_buffer_subbuf_order_set - set the size of ring buffer sub page. + * @buffer: The ring_buffer to set the new page size. + * @order: Order of the system pages in one sub buffer page + * + * By default, one ring buffer pages equals to one system page. This API can be + * used to set new size of the ring buffer page. The size must be order of + * system page size, that's why the input parameter @order is the order of + * system pages that are allocated for one ring buffer page: + * 0 - 1 system page + * 1 - 2 system pages + * 3 - 4 system pages + * ... + * + * Returns 0 on success or < 0 in case of an error. + */ +int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) +{ + int psize; + + if (!buffer || order < 0) + return -EINVAL; + + if (buffer->subbuf_order == order) + return 0; + + psize = (1 << order) * PAGE_SIZE; + if (psize <= BUF_PAGE_HDR_SIZE) + return -EINVAL; + + buffer->subbuf_order = order; + buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE; + + /* Todo: reset the buffer with the new page size */ + + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); + /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 76dd0a4c8cb5e..a010aba4c4a43 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9358,6 +9358,51 @@ static const struct file_operations buffer_percent_fops = { .llseek = default_llseek, }; +static ssize_t +buffer_order_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%d\n", ring_buffer_subbuf_order_get(tr->array_buffer.buffer)); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +buffer_order_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* limit between 1 and 128 system pages */ + if (val < 0 || val > 7) + return -EINVAL; + + ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val); + if (ret) + return ret; + + (*ppos)++; + + return cnt; +} + +static const struct file_operations buffer_order_fops = { + .open = tracing_open_generic_tr, + .read = buffer_order_read, + .write = buffer_order_write, + .release = tracing_release_generic_tr, + .llseek = default_llseek, +}; + static struct dentry *trace_instance_dir; static void @@ -9824,6 +9869,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, tr, &buffer_percent_fops); + trace_create_file("buffer_subbuf_order", TRACE_MODE_WRITE, d_tracer, + tr, &buffer_order_fops); + create_trace_options_dir(tr); #ifdef CONFIG_TRACER_MAX_TRACE -- GitLab From f9b94daa542a8d2532f0930f01cd9aec2d19621b Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:18 -0500 Subject: [PATCH 0410/1290] ring-buffer: Set new size of the ring buffer sub page There are two approaches when changing the size of the ring buffer sub page: 1. Destroying all pages and allocating new pages with the new size. 2. Allocating new pages, copying the content of the old pages before destroying them. The first approach is easier, it is selected in the proposed implementation. Changing the ring buffer sub page size is supposed to not happen frequently. Usually, that size should be set only once, when the buffer is not in use yet and is supposed to be empty. Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-5-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.588995543@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 80 ++++++++++++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 7 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 20fc0121735d3..b928bd03dd0eb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -332,6 +332,7 @@ struct buffer_page { unsigned read; /* index for next read */ local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ + unsigned order; /* order of the page */ struct buffer_data_page *page; /* Actual data page */ }; @@ -362,7 +363,7 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) static void free_buffer_page(struct buffer_page *bpage) { - free_page((unsigned long)bpage->page); + free_pages((unsigned long)bpage->page, bpage->order); kfree(bpage); } @@ -1460,10 +1461,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0); + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, + cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; bpage->page = page_address(page); + bpage->order = cpu_buffer->buffer->subbuf_order; rb_init_page(bpage->page); if (user_thread && fatal_signal_pending(current)) @@ -1542,7 +1545,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; bpage->page = page_address(page); @@ -1626,6 +1630,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, goto fail_free_buffer; /* Default buffer page size - one system page */ + buffer->subbuf_order = 0; buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE; /* Max payload is buffer page size - header (8bytes) */ @@ -5503,8 +5508,8 @@ void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) if (bpage) goto out; - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, + cpu_buffer->buffer->subbuf_order); if (!page) return ERR_PTR(-ENOMEM); @@ -5553,7 +5558,7 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data local_irq_restore(flags); out: - free_page((unsigned long)bpage); + free_pages((unsigned long)bpage, buffer->subbuf_order); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); @@ -5813,7 +5818,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get); */ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) { + struct ring_buffer_per_cpu **cpu_buffers; + int old_order, old_size; + int nr_pages; int psize; + int bsize; + int err; + int cpu; if (!buffer || order < 0) return -EINVAL; @@ -5825,12 +5836,67 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) if (psize <= BUF_PAGE_HDR_SIZE) return -EINVAL; + bsize = sizeof(void *) * buffer->cpus; + cpu_buffers = kzalloc(bsize, GFP_KERNEL); + if (!cpu_buffers) + return -ENOMEM; + + old_order = buffer->subbuf_order; + old_size = buffer->subbuf_size; + + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + atomic_inc(&buffer->record_disabled); + + /* Make sure all commits have finished */ + synchronize_rcu(); + buffer->subbuf_order = order; buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE; - /* Todo: reset the buffer with the new page size */ + /* Make sure all new buffers are allocated, before deleting the old ones */ + for_each_buffer_cpu(buffer, cpu) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + continue; + + nr_pages = buffer->buffers[cpu]->nr_pages; + cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!cpu_buffers[cpu]) { + err = -ENOMEM; + goto error; + } + } + + for_each_buffer_cpu(buffer, cpu) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + continue; + + rb_free_cpu_buffer(buffer->buffers[cpu]); + buffer->buffers[cpu] = cpu_buffers[cpu]; + } + + atomic_dec(&buffer->record_disabled); + mutex_unlock(&buffer->mutex); + + kfree(cpu_buffers); return 0; + +error: + buffer->subbuf_order = old_order; + buffer->subbuf_size = old_size; + + atomic_dec(&buffer->record_disabled); + mutex_unlock(&buffer->mutex); + + for_each_buffer_cpu(buffer, cpu) { + if (!cpu_buffers[cpu]) + continue; + kfree(cpu_buffers[cpu]); + } + kfree(cpu_buffers); + + return err; } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); -- GitLab From bce761d757452ba5eb77e11fecc37a04b67494e7 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:19 -0500 Subject: [PATCH 0411/1290] ring-buffer: Read and write to ring buffers with custom sub buffer size As the size of the ring sub buffer page can be changed dynamically, the logic that reads and writes to the buffer should be fixed to take that into account. Some internal ring buffer APIs are changed: ring_buffer_alloc_read_page() ring_buffer_free_read_page() ring_buffer_read_page() A new API is introduced: ring_buffer_read_page_data() Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-6-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.875145995@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) [ Fixed kerneldoc on data_page parameter in ring_buffer_free_read_page() ] Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 11 ++-- kernel/trace/ring_buffer.c | 75 ++++++++++++++++++++-------- kernel/trace/ring_buffer_benchmark.c | 10 ++-- kernel/trace/trace.c | 34 +++++++------ 4 files changed, 89 insertions(+), 41 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 12573306b8892..fa802db216f94 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -192,10 +192,15 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer); size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu); size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu); -void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu); -void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data); -int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page, +struct buffer_data_read_page; +struct buffer_data_read_page * +ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu); +void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, + struct buffer_data_read_page *page); +int ring_buffer_read_page(struct trace_buffer *buffer, + struct buffer_data_read_page *data_page, size_t len, int cpu, int full); +void *ring_buffer_read_page_data(struct buffer_data_read_page *page); struct trace_seq; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b928bd03dd0eb..c2afcf98ea912 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -318,6 +318,11 @@ struct buffer_data_page { unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ }; +struct buffer_data_read_page { + unsigned order; /* order of the page */ + struct buffer_data_page *data; /* actual data, stored in this page */ +}; + /* * Note, the buffer_page list must be first. The buffer pages * are allocated in cache lines, which means that each buffer @@ -5483,40 +5488,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * Returns: * The page allocated, or ERR_PTR */ -void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) +struct buffer_data_read_page * +ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_data_page *bpage = NULL; + struct buffer_data_read_page *bpage = NULL; unsigned long flags; struct page *page; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-ENODEV); + bpage = kzalloc(sizeof(*bpage), GFP_KERNEL); + if (!bpage) + return ERR_PTR(-ENOMEM); + + bpage->order = buffer->subbuf_order; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); if (cpu_buffer->free_page) { - bpage = cpu_buffer->free_page; + bpage->data = cpu_buffer->free_page; cpu_buffer->free_page = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); - if (bpage) + if (bpage->data) goto out; page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, cpu_buffer->buffer->subbuf_order); - if (!page) + if (!page) { + kfree(bpage); return ERR_PTR(-ENOMEM); + } - bpage = page_address(page); + bpage->data = page_address(page); out: - rb_init_page(bpage); + rb_init_page(bpage->data); return bpage; } @@ -5526,14 +5539,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for * @cpu: the cpu buffer the page came from - * @data: the page to free + * @data_page: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. */ -void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data) +void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, + struct buffer_data_read_page *data_page) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_data_page *bpage = data; + struct buffer_data_page *bpage = data_page->data; struct page *page = virt_to_page(bpage); unsigned long flags; @@ -5542,8 +5556,12 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data cpu_buffer = buffer->buffers[cpu]; - /* If the page is still in use someplace else, we can't reuse it */ - if (page_ref_count(page) > 1) + /* + * If the page is still in use someplace else, or order of the page + * is different from the subbuffer order of the buffer - + * we can't reuse it + */ + if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order) goto out; local_irq_save(flags); @@ -5558,7 +5576,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data local_irq_restore(flags); out: - free_pages((unsigned long)bpage, buffer->subbuf_order); + free_pages((unsigned long)bpage, data_page->order); + kfree(data_page); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); @@ -5579,9 +5598,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * rpage = ring_buffer_alloc_read_page(buffer, cpu); * if (IS_ERR(rpage)) * return PTR_ERR(rpage); - * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); + * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0); * if (ret >= 0) - * process_page(rpage, ret); + * process_page(ring_buffer_read_page_data(rpage), ret); + * ring_buffer_free_read_page(buffer, cpu, rpage); * * When @full is set, the function will not return true unless * the writer is off the reader page. @@ -5596,7 +5616,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * <0 if no data has been transferred. */ int ring_buffer_read_page(struct trace_buffer *buffer, - void **data_page, size_t len, int cpu, int full) + struct buffer_data_read_page *data_page, + size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; @@ -5621,10 +5642,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer, len -= BUF_PAGE_HDR_SIZE; - if (!data_page) + if (!data_page || !data_page->data) + goto out; + if (data_page->order != buffer->subbuf_order) goto out; - bpage = *data_page; + bpage = data_page->data; if (!bpage) goto out; @@ -5718,11 +5741,11 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* swap the pages */ rb_init_page(bpage); bpage = reader->page; - reader->page = *data_page; + reader->page = data_page->data; local_set(&reader->write, 0); local_set(&reader->entries, 0); reader->read = 0; - *data_page = bpage; + data_page->data = bpage; /* * Use the real_end for the data size, @@ -5767,6 +5790,18 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); +/** + * ring_buffer_read_page_data - get pointer to the data in the page. + * @page: the page to get the data from + * + * Returns pointer to the actual data in this page. + */ +void *ring_buffer_read_page_data(struct buffer_data_read_page *page) +{ + return page->data; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_page_data); + /** * ring_buffer_subbuf_size_get - get size of the sub buffer. * @buffer: the buffer to get the sub buffer size from diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index aef34673d79d0..008187ebd7fe6 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -104,10 +104,11 @@ static enum event_status read_event(int cpu) static enum event_status read_page(int cpu) { + struct buffer_data_read_page *bpage; struct ring_buffer_event *event; struct rb_page *rpage; unsigned long commit; - void *bpage; + int page_size; int *entry; int ret; int inc; @@ -117,14 +118,15 @@ static enum event_status read_page(int cpu) if (IS_ERR(bpage)) return EVENT_DROPPED; - ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + page_size = ring_buffer_subbuf_size_get(buffer); + ret = ring_buffer_read_page(buffer, bpage, page_size, cpu, 1); if (ret >= 0) { - rpage = bpage; + rpage = ring_buffer_read_page_data(bpage); /* The commit may have missed event flags set, clear them */ commit = local_read(&rpage->commit) & 0xfffff; for (i = 0; i < commit && !test_error ; i += inc) { - if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + if (i >= (page_size - offsetof(struct rb_page, data))) { TEST_ERROR(); break; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a010aba4c4a43..711095aa731d3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8286,6 +8286,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; + void *trace_data; + int page_size; ssize_t ret = 0; ssize_t size; @@ -8297,6 +8299,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return -EBUSY; #endif + page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); + if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer, iter->cpu_file); @@ -8311,13 +8315,13 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return ret; /* Do we have previous read data to read? */ - if (info->read < PAGE_SIZE) + if (info->read < page_size) goto read; again: trace_access_lock(iter->cpu_file); ret = ring_buffer_read_page(iter->array_buffer->buffer, - &info->spare, + info->spare, count, iter->cpu_file, 0); trace_access_unlock(iter->cpu_file); @@ -8338,11 +8342,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->read = 0; read: - size = PAGE_SIZE - info->read; + size = page_size - info->read; if (size > count) size = count; - - ret = copy_to_user(ubuf, info->spare + info->read, size); + trace_data = ring_buffer_read_page_data(info->spare); + ret = copy_to_user(ubuf, trace_data + info->read, size); if (ret == size) return -EFAULT; @@ -8453,6 +8457,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; + int page_size; int entries, i; ssize_t ret = 0; @@ -8461,13 +8466,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -EBUSY; #endif - if (*ppos & (PAGE_SIZE - 1)) + page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); + if (*ppos & (page_size - 1)) return -EINVAL; - if (len & (PAGE_SIZE - 1)) { - if (len < PAGE_SIZE) + if (len & (page_size - 1)) { + if (len < page_size) return -EINVAL; - len &= PAGE_MASK; + len &= (~(page_size - 1)); } if (splice_grow_spd(pipe, &spd)) @@ -8477,7 +8483,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); - for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) { struct page *page; int r; @@ -8498,7 +8504,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } ref->cpu = iter->cpu_file; - r = ring_buffer_read_page(ref->buffer, &ref->page, + r = ring_buffer_read_page(ref->buffer, ref->page, len, iter->cpu_file, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->cpu, @@ -8507,14 +8513,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - page = virt_to_page(ref->page); + page = virt_to_page(ring_buffer_read_page_data(ref->page)); spd.pages[i] = page; - spd.partial[i].len = PAGE_SIZE; + spd.partial[i].len = page_size; spd.partial[i].offset = 0; spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; - *ppos += PAGE_SIZE; + *ppos += page_size; entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); } -- GitLab From daca194876a94565194f6e32ddb0355af8cf30f7 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 20 Dec 2023 10:24:56 +0200 Subject: [PATCH 0412/1290] vfio/virtio: Declare virtiovf_pci_aer_reset_done() static Declare virtiovf_pci_aer_reset_done() as a static function to prevent the below build warning. "warning: no previous prototype for 'virtiovf_pci_aer_reset_done' [-Wmissing-prototypes]" Fixes: eb61eca0e8c3 ("vfio/virtio: Introduce a vfio driver over virtio devices") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20231220143122.63337669@canb.auug.org.au/ Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20231220082456.241973-1-yishaih@nvidia.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312202115.oDmvN1VE-lkp@intel.com/ Signed-off-by: Alex Williamson --- drivers/vfio/pci/virtio/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 291c55b641f15..d5af683837d34 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -547,7 +547,7 @@ static const struct pci_device_id virtiovf_pci_table[] = { MODULE_DEVICE_TABLE(pci, virtiovf_pci_table); -void virtiovf_pci_aer_reset_done(struct pci_dev *pdev) +static void virtiovf_pci_aer_reset_done(struct pci_dev *pdev) { struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev); -- GitLab From e43c64c971e48d11ee100c5a8b2eadbed056f924 Mon Sep 17 00:00:00 2001 From: Veronika Molnarova Date: Tue, 12 Dec 2023 17:59:09 +0100 Subject: [PATCH 0413/1290] perf archive: Add new option '--unpack' to expand tarballs Archives generated by the command 'perf archive' have to be unpacked manually. Following the addition of option '--all' now there also exist a nested structure of tars, and after further discussion with Red Hat Global Support Services, they found a feature correctly unpacking archives of 'perf archive' convenient. Option '--unpack' of 'perf archive' unpacks archives generated by the command 'perf archive' as well as archives generated when used with option '--all'. The 'perf.data' file is placed in the current directory, while debug symbols are unpacked in '~/.debug' directory. A tar filename can be passed as an argument, and if not provided the command tries to find a viable perf.tar file for unpacking. Signed-off-by: Veronika Molnarova Tested-by: Arnaldo Carvalho de Melo Cc: Michael Petlan Link: https://lore.kernel.org/r/20231212165909.14459-2-vmolnaro@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/perf-archive.sh | 66 +++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/tools/perf/perf-archive.sh b/tools/perf/perf-archive.sh index a92042eae95a1..f94795794b361 100755 --- a/tools/perf/perf-archive.sh +++ b/tools/perf/perf-archive.sh @@ -7,17 +7,72 @@ PERF_DATA=perf.data PERF_SYMBOLS=perf.symbols PERF_ALL=perf.all ALL=0 +UNPACK=0 while [ $# -gt 0 ] ; do if [ $1 == "--all" ]; then ALL=1 shift + elif [ $1 == "--unpack" ]; then + UNPACK=1 + shift else PERF_DATA=$1 + UNPACK_TAR=$1 shift fi done +if [ $UNPACK -eq 1 ]; then + if [ ! -z "$UNPACK_TAR" ]; then # tar given as an argument + if [ ! -e "$UNPACK_TAR" ]; then + echo "Provided file $UNPACK_TAR does not exist" + exit 1 + fi + TARGET="$UNPACK_TAR" + else # search for perf tar in the current directory + TARGET=`find . -regex "\./perf.*\.tar\.bz2"` + TARGET_NUM=`echo -n "$TARGET" | grep -c '^'` + + if [ -z "$TARGET" -o $TARGET_NUM -gt 1 ]; then + echo -e "Error: $TARGET_NUM files found for unpacking:\n$TARGET" + echo "Provide the requested file as an argument" + exit 1 + else + echo "Found target file for unpacking: $TARGET" + fi + fi + + if [[ "$TARGET" =~ (\./)?$PERF_ALL.*.tar.bz2 ]]; then # perf tar generated by --all option + TAR_CONTENTS=`tar tvf "$TARGET" | tr -s " " | cut -d " " -f 6` + VALID_TAR=`echo "$TAR_CONTENTS" | grep "$PERF_SYMBOLS.tar.bz2" | wc -l` # check if it contains a sub-tar perf.symbols + if [ $VALID_TAR -ne 1 ]; then + echo "Error: $TARGET file is not valid (contains zero or multiple sub-tar files with debug symbols)" + exit 1 + fi + + INTERSECT=`comm -12 <(ls) <(echo "$TAR_CONTENTS") | tr "\n" " "` # check for overwriting + if [ ! -z "$INTERSECT" ]; then # prompt if file(s) already exist in the current directory + echo "File(s) ${INTERSECT::-1} already exist in the current directory." + while true; do + read -p 'Do you wish to overwrite them? ' yn + case $yn in + [Yy]* ) break;; + [Nn]* ) exit 1;; + * ) echo "Please answer yes or no.";; + esac + done + fi + + # unzip the perf.data file in the current working directory and debug symbols in ~/.debug directory + tar xvf $TARGET && tar xvf $PERF_SYMBOLS.tar.bz2 -C ~/.debug + + else # perf tar generated by perf archive (contains only debug symbols) + tar xvf $TARGET -C ~/.debug + fi + exit 0 +fi + # # PERF_BUILDID_DIR environment variable set by perf # path to buildid directory, default to $HOME/.debug @@ -55,17 +110,12 @@ if [ $ALL -eq 1 ]; then # pack perf.data file together with tar containing tar cjf $PERF_SYMBOLS.tar.bz2 -C $PERF_BUILDID_DIR -T $MANIFEST tar cjf $PERF_ALL-$HOSTNAME-$DATE.tar.bz2 $PERF_DATA $PERF_SYMBOLS.tar.bz2 rm $PERF_SYMBOLS.tar.bz2 $MANIFEST $BUILDIDS || true - - echo -e "Now please run:\n" - echo -e "$ tar xvf $PERF_ALL-$HOSTNAME-$DATE.tar.bz2 && tar xvf $PERF_SYMBOLS.tar.bz2 -C ~/.debug\n" - echo "wherever you need to run 'perf report' on." else # pack only the debug symbols tar cjf $PERF_DATA.tar.bz2 -C $PERF_BUILDID_DIR -T $MANIFEST rm $MANIFEST $BUILDIDS || true - - echo -e "Now please run:\n" - echo -e "$ tar xvf $PERF_DATA.tar.bz2 -C ~/.debug\n" - echo "wherever you need to run 'perf report' on." fi +echo -e "Now please run:\n" +echo -e "$ perf archive --unpack\n" +echo "or unpack the tar manually wherever you need to run 'perf report' on." exit 0 -- GitLab From c344675ad267040b1794b0b5d85147a5023c548d Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 14 Dec 2023 20:33:03 +0800 Subject: [PATCH 0414/1290] perf scripts python arm-cs-trace-disasm.py: Set start vm addr of exectable file to 0 For exectable ELF file, which e_type is ET_EXEC, dso start address is a absolute address other than offset. Just set vm_start to zero when dso start is 0x400000, which means it is a exectable file. Reviewed-by: James Clark Signed-off-by: Ruidong Tian Cc: Adrian Hunter Cc: Al Grant Cc: Alexander Shishkin Cc: Leo Yan Cc: Mathieu Poirier Cc: Mike Leach Cc: Suzuki Poulouse Cc: Tor Jeremiassen Link: https://lore.kernel.org/r/20231214123304.34087-3-tianruidong@linux.alibaba.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/arm-cs-trace-disasm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/scripts/python/arm-cs-trace-disasm.py b/tools/perf/scripts/python/arm-cs-trace-disasm.py index de58991c78bb4..e33aee5c78d72 100755 --- a/tools/perf/scripts/python/arm-cs-trace-disasm.py +++ b/tools/perf/scripts/python/arm-cs-trace-disasm.py @@ -258,8 +258,9 @@ def process_event(param_dict): if (options.objdump_name != None): # It doesn't need to decrease virtual memory offset for disassembly - # for kernel dso, so in this case we set vm_start to zero. - if (dso == "[kernel.kallsyms]"): + # for kernel dso and executable file dso, so in this case we set + # vm_start to zero. + if (dso == "[kernel.kallsyms]" or dso_start == 0x400000): dso_vm_start = 0 else: dso_vm_start = int(dso_start) -- GitLab From 2d98dbb4c9c5b09c8d2411581ab17921f872dbd3 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Thu, 14 Dec 2023 20:33:04 +0800 Subject: [PATCH 0415/1290] perf scripts python arm-cs-trace-disasm.py: Do not ignore disam first sample arm-cs-trace-disasm ignore disam the first branch sample, For example as follow, the instructions beteween 0x0000ffffae878750 and 0x0000ffffae878754 is lose: ARM CoreSight Trace Data Assembler Dump Event type: branches:uH Sample = { cpu: 0000 addr: 0x0000ffffae878750 phys_addr: 0x0000000000000000 ip: 0x0000000000000000 pid: 4003489 tid: 4003489 period: 1 time: 26765151766034 } Event type: branches:uH Sample = { cpu: 0000 addr: 0x0000000000000000 phys_addr: 0x0000000000000000 ip: 0x0000ffffae878754 pid: 4003489 tid: 4003489 period: 1 time: 26765151766034 } Initialize cpu_data earlier to fix it: ARM CoreSight Trace Data Assembler Dump Event type: branches:uH Sample = { cpu: 0000 addr: 0x0000000000000000 phys_addr: 0x0000000000000000 ip: 0x0000ffffae878754 pid: 4003489 tid: 4003489 period: 1 time: 26765151766034 } 0000000000028740 : (base address is 0x0000ffffae850000) 28750: b13ffc1f cmn x0, #4095 28754: 54000042 b.hs 0x2875c test 4003489/4003489 [0000] 26765.151766034 __GI___ioctl+0x14 /usr/lib64/libc-2.32.so Event type: branches:uH Sample = { cpu: 0000 addr: 0x0000ffffa67535ac phys_addr: 0x0000000000000000 ip: 0x0000000000000000 pid: 4003489 tid: 4003489 period: 1 time: 26765151766034 } Reviewed-by: James Clark Signed-off-by: Ruidong Tian Cc: Adrian Hunter Cc: Al Grant Cc: Alexander Shishkin Cc: Leo Yan Cc: Mathieu Poirier Cc: Mike Leach Cc: Suzuki Poulouse Cc: Tor Jeremiassen Link: https://lore.kernel.org/r/20231214123304.34087-4-tianruidong@linux.alibaba.com Signed-off-by: Arnaldo Carvalho de Melo --- .../scripts/python/arm-cs-trace-disasm.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/perf/scripts/python/arm-cs-trace-disasm.py b/tools/perf/scripts/python/arm-cs-trace-disasm.py index e33aee5c78d72..d973c2baed1c8 100755 --- a/tools/perf/scripts/python/arm-cs-trace-disasm.py +++ b/tools/perf/scripts/python/arm-cs-trace-disasm.py @@ -188,6 +188,17 @@ def process_event(param_dict): dso_end = get_optional(param_dict, "dso_map_end") symbol = get_optional(param_dict, "symbol") + cpu = sample["cpu"] + ip = sample["ip"] + addr = sample["addr"] + + # Initialize CPU data if it's empty, and directly return back + # if this is the first tracing event for this CPU. + if (cpu_data.get(str(cpu) + 'addr') == None): + cpu_data[str(cpu) + 'addr'] = addr + return + + if (options.verbose == True): print("Event type: %s" % name) print_sample(sample) @@ -209,16 +220,6 @@ def process_event(param_dict): if (name[0:8] != "branches"): return - cpu = sample["cpu"] - ip = sample["ip"] - addr = sample["addr"] - - # Initialize CPU data if it's empty, and directly return back - # if this is the first tracing event for this CPU. - if (cpu_data.get(str(cpu) + 'addr') == None): - cpu_data[str(cpu) + 'addr'] = addr - return - # The format for packet is: # # +------------+------------+------------+ -- GitLab From 16f533ade706d33e60324ff32e526bda20bccbd9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:45 -0800 Subject: [PATCH 0416/1290] perf unwind: Use function to add missing maps lock Switch read_unwind_spec_eh_frame() from loop macro maps__for_each_entry() to maps__for_each_map() function that takes a callback. The function holds the maps lock, which should be held during iteration. Committer notes: Fixed up conflict with: 4fb54994b2360ab5 ("perf unwind-libunwind: Fix base address for .eh_frame") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: German Gomez Cc: James Clark Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Steinar H. Gunderson Cc: Wenyu Liu Cc: Yang Jihong Cc: changbin du Cc: colin ian king Cc: dmitrii dolgov <9erthalion6@gmail.com> Cc: guilherme amadio Cc: huacai chen Cc: k prateek nayak Cc: li dong Cc: liam howlett Cc: miguel ojeda Cc: ming wang Cc: sean christopherson Cc: vincent whitchurch Link: http://lore.kernel.org/lkml/20231207011722.1220634-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/unwind-libunwind-local.c | 34 +++++++++++++++++------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index 5e5c3395a4998..dac536e28360a 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -302,12 +302,31 @@ static int unwind_spec_ehframe(struct dso *dso, struct machine *machine, return 0; } +struct read_unwind_spec_eh_frame_maps_cb_args { + struct dso *dso; + u64 base_addr; +}; + +static int read_unwind_spec_eh_frame_maps_cb(struct map *map, void *data) +{ + + struct read_unwind_spec_eh_frame_maps_cb_args *args = data; + + if (map__dso(map) == args->dso && map__start(map) - map__pgoff(map) < args->base_addr) + args->base_addr = map__start(map) - map__pgoff(map); + + return 0; +} + + static int read_unwind_spec_eh_frame(struct dso *dso, struct unwind_info *ui, u64 *table_data, u64 *segbase, u64 *fde_count) { - struct map_rb_node *map_node; - u64 base_addr = UINT64_MAX; + struct read_unwind_spec_eh_frame_maps_cb_args args = { + .dso = dso, + .base_addr = UINT64_MAX, + }; int ret, fd; if (dso->data.eh_frame_hdr_offset == 0) { @@ -325,16 +344,11 @@ static int read_unwind_spec_eh_frame(struct dso *dso, struct unwind_info *ui, return -EINVAL; } - maps__for_each_entry(thread__maps(ui->thread), map_node) { - struct map *map = map_node->map; - u64 start = map__start(map) - map__pgoff(map); + maps__for_each_map(thread__maps(ui->thread), read_unwind_spec_eh_frame_maps_cb, &args); - if (map__dso(map) == dso && start < base_addr) - base_addr = start; - } - base_addr -= dso->data.elf_base_addr; + args.base_addr -= dso->data.elf_base_addr; /* Address of .eh_frame_hdr */ - *segbase = base_addr + dso->data.eh_frame_hdr_addr; + *segbase = args.base_addr + dso->data.eh_frame_hdr_addr; ret = unwind_spec_ehframe(dso, ui->machine, dso->data.eh_frame_hdr_offset, table_data, fde_count); if (ret) -- GitLab From 51ab715e2bf037cd0525494b7ed496c46e4013c6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:46 -0800 Subject: [PATCH 0417/1290] perf vdso: Use function to add missing maps lock Switch machine__thread_dso_type() from loop macro maps__for_each_entry() to maps__for_each_map() function that takes a callback. The function holds the maps lock, which should be held during iteration. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/vdso.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c index ae3eee69b659c..df8963796187d 100644 --- a/tools/perf/util/vdso.c +++ b/tools/perf/util/vdso.c @@ -140,23 +140,34 @@ static struct dso *__machine__addnew_vdso(struct machine *machine, const char *s return dso; } +struct machine__thread_dso_type_maps_cb_args { + struct machine *machine; + enum dso_type dso_type; +}; + +static int machine__thread_dso_type_maps_cb(struct map *map, void *data) +{ + struct machine__thread_dso_type_maps_cb_args *args = data; + struct dso *dso = map__dso(map); + + if (!dso || dso->long_name[0] != '/') + return 0; + + args->dso_type = dso__type(dso, args->machine); + return (args->dso_type != DSO__TYPE_UNKNOWN) ? 1 : 0; +} + static enum dso_type machine__thread_dso_type(struct machine *machine, struct thread *thread) { - enum dso_type dso_type = DSO__TYPE_UNKNOWN; - struct map_rb_node *rb_node; - - maps__for_each_entry(thread__maps(thread), rb_node) { - struct dso *dso = map__dso(rb_node->map); + struct machine__thread_dso_type_maps_cb_args args = { + .machine = machine, + .dso_type = DSO__TYPE_UNKNOWN, + }; - if (!dso || dso->long_name[0] != '/') - continue; - dso_type = dso__type(dso, machine); - if (dso_type != DSO__TYPE_UNKNOWN) - break; - } + maps__for_each_map(thread__maps(thread), machine__thread_dso_type_maps_cb, &args); - return dso_type; + return args.dso_type; } #if BITS_PER_LONG == 64 -- GitLab From 9cce3a161e17b5c1d50de75e85c1aeb7452d17e6 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:47 -0800 Subject: [PATCH 0418/1290] perf maps: Reduce scope of maps__for_each_entry() Reduce scope of maps__for_each_entry() as maps__for_each_map() is a safer alternative holding the maps lock during iteration. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-14-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 3 +++ tools/perf/util/maps.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 160a6dce54bb6..00e6589bba10d 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -10,6 +10,9 @@ #include "ui/ui.h" #include "unwind.h" +#define maps__for_each_entry(maps, map) \ + for (map = maps__first(maps); map; map = map_rb_node__next(map)) + static void maps__init(struct maps *maps, struct machine *machine) { refcount_set(maps__refcnt(maps), 1); diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h index 14ad959792577..8ac30cdaf5bd1 100644 --- a/tools/perf/util/maps.h +++ b/tools/perf/util/maps.h @@ -36,9 +36,6 @@ struct map_rb_node *map_rb_node__next(struct map_rb_node *node); struct map_rb_node *maps__find_node(struct maps *maps, struct map *map); struct map *maps__find(struct maps *maps, u64 addr); -#define maps__for_each_entry(maps, map) \ - for (map = maps__first(maps); map; map = map_rb_node__next(map)) - #define maps__for_each_entry_safe(maps, map, next) \ for (map = maps__first(maps), next = map_rb_node__next(map); map; \ map = next, next = map_rb_node__next(map)) -- GitLab From 8d5847a61723b4dbb3da3b356925c4928ed14de2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:48 -0800 Subject: [PATCH 0419/1290] perf maps: Add remove maps function to remove a map based on callback Removing maps wasn't being done under the write lock. Similar to maps__for_each_map(), iterate the entries but in this case remove the entry based on the result of the callback. If an entry was removed then maps_by_name() also needs updating, so add missed flush. In dso__load_kcore(), the test of map to save would always be false with REFCNT_CHECKING because of a missing RC_CHK_ACCESS/RC_CHK_EQUAL. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 24 ++++++++++++++++++++++++ tools/perf/util/maps.h | 6 ++---- tools/perf/util/symbol.c | 24 ++++++++++++------------ 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 00e6589bba10d..f13fd3a9686b1 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -13,6 +13,10 @@ #define maps__for_each_entry(maps, map) \ for (map = maps__first(maps); map; map = map_rb_node__next(map)) +#define maps__for_each_entry_safe(maps, map, next) \ + for (map = maps__first(maps), next = map_rb_node__next(map); map; \ + map = next, next = map_rb_node__next(map)) + static void maps__init(struct maps *maps, struct machine *machine) { refcount_set(maps__refcnt(maps), 1); @@ -214,6 +218,26 @@ int maps__for_each_map(struct maps *maps, int (*cb)(struct map *map, void *data) return ret; } +void maps__remove_maps(struct maps *maps, bool (*cb)(struct map *map, void *data), void *data) +{ + struct map_rb_node *pos, *next; + unsigned int start_nr_maps; + + down_write(maps__lock(maps)); + + start_nr_maps = maps__nr_maps(maps); + maps__for_each_entry_safe(maps, pos, next) { + if (cb(pos->map, data)) { + __maps__remove(maps, pos); + --RC_CHK_ACCESS(maps)->nr_maps; + } + } + if (maps__maps_by_name(maps) && start_nr_maps != maps__nr_maps(maps)) + __maps__free_maps_by_name(maps); + + up_write(maps__lock(maps)); +} + struct symbol *maps__find_symbol(struct maps *maps, u64 addr, struct map **mapp) { struct map *map = maps__find(maps, addr); diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h index 8ac30cdaf5bd1..b94ad5c8fea74 100644 --- a/tools/perf/util/maps.h +++ b/tools/perf/util/maps.h @@ -36,10 +36,6 @@ struct map_rb_node *map_rb_node__next(struct map_rb_node *node); struct map_rb_node *maps__find_node(struct maps *maps, struct map *map); struct map *maps__find(struct maps *maps, u64 addr); -#define maps__for_each_entry_safe(maps, map, next) \ - for (map = maps__first(maps), next = map_rb_node__next(map); map; \ - map = next, next = map_rb_node__next(map)) - DECLARE_RC_STRUCT(maps) { struct rb_root entries; struct rw_semaphore lock; @@ -80,6 +76,8 @@ static inline void __maps__zput(struct maps **map) /* Iterate over map calling cb for each entry. */ int maps__for_each_map(struct maps *maps, int (*cb)(struct map *map, void *data), void *data); +/* Iterate over map removing an entry if cb returns true. */ +void maps__remove_maps(struct maps *maps, bool (*cb)(struct map *map, void *data), void *data); static inline struct rb_root *maps__entries(struct maps *maps) { diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 72f03b875478a..be212ba157dc3 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1239,13 +1239,23 @@ static int kcore_mapfn(u64 start, u64 len, u64 pgoff, void *data) return 0; } +static bool remove_old_maps(struct map *map, void *data) +{ + const struct map *map_to_save = data; + + /* + * We need to preserve eBPF maps even if they are covered by kcore, + * because we need to access eBPF dso for source data. + */ + return !RC_CHK_EQUAL(map, map_to_save) && !__map__is_bpf_prog(map); +} + static int dso__load_kcore(struct dso *dso, struct map *map, const char *kallsyms_filename) { struct maps *kmaps = map__kmaps(map); struct kcore_mapfn_data md; struct map *replacement_map = NULL; - struct map_rb_node *old_node, *next; struct machine *machine; bool is_64_bit; int err, fd; @@ -1292,17 +1302,7 @@ static int dso__load_kcore(struct dso *dso, struct map *map, } /* Remove old maps */ - maps__for_each_entry_safe(kmaps, old_node, next) { - struct map *old_map = old_node->map; - - /* - * We need to preserve eBPF maps even if they are - * covered by kcore, because we need to access - * eBPF dso for source data. - */ - if (old_map != map && !__map__is_bpf_prog(old_map)) - maps__remove(kmaps, old_map); - } + maps__remove_maps(kmaps, remove_old_maps, map); machine->trampolines_mapped = false; /* Find the kernel map using the '_stext' symbol */ -- GitLab From ec49230cf6dda70437bf878281566dd82af9affa Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:49 -0800 Subject: [PATCH 0420/1290] perf debug: Expose debug file Some dumping call backs need to be passed a FILE*. Expose debug file via an accessor API for a consistent way to do this. Catch the unlikely failure of it not being set. Switch two cases where stderr was being used instead of debug_file. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-16-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/debug.c | 22 +++++++++++++++------- tools/perf/util/debug.h | 1 + 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index 88378c4c5dd9e..e282b4ceb4d25 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -38,12 +38,21 @@ bool dump_trace = false, quiet = false; int debug_ordered_events; static int redirect_to_stderr; int debug_data_convert; -static FILE *debug_file; +static FILE *_debug_file; bool debug_display_time; +FILE *debug_file(void) +{ + if (!_debug_file) { + pr_warning_once("debug_file not set"); + debug_set_file(stderr); + } + return _debug_file; +} + void debug_set_file(FILE *file) { - debug_file = file; + _debug_file = file; } void debug_set_display_time(bool set) @@ -78,8 +87,8 @@ int veprintf(int level, int var, const char *fmt, va_list args) if (use_browser >= 1 && !redirect_to_stderr) { ui_helpline__vshow(fmt, args); } else { - ret = fprintf_time(debug_file); - ret += vfprintf(debug_file, fmt, args); + ret = fprintf_time(debug_file()); + ret += vfprintf(debug_file(), fmt, args); } } @@ -107,9 +116,8 @@ static int veprintf_time(u64 t, const char *fmt, va_list args) nsecs -= secs * NSEC_PER_SEC; usecs = nsecs / NSEC_PER_USEC; - ret = fprintf(stderr, "[%13" PRIu64 ".%06" PRIu64 "] ", - secs, usecs); - ret += vfprintf(stderr, fmt, args); + ret = fprintf(debug_file(), "[%13" PRIu64 ".%06" PRIu64 "] ", secs, usecs); + ret += vfprintf(debug_file(), fmt, args); return ret; } diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index f99468a7f6817..de8870980d44a 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -77,6 +77,7 @@ int eprintf_time(int level, int var, u64 t, const char *fmt, ...) __printf(4, 5) int veprintf(int level, int var, const char *fmt, va_list args); int perf_debug_option(const char *str); +FILE *debug_file(void); void debug_set_file(FILE *file); void debug_set_display_time(bool set); void perf_debug_setup(void); -- GitLab From 07ef14d50cf1af1caa49cd183216a517e75e8a93 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:50 -0800 Subject: [PATCH 0421/1290] perf maps: Refactor maps__fixup_overlappings() Rename to maps__fixup_overlap_and_insert() as the given mapping is always inserted. Factor out first_ending_after() as a utility function. Minor variable name changes. Switch to using debug_file() rather than passing a debug FILE*. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-17-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 57 ++++++++++++++++++++++++---------------- tools/perf/util/maps.h | 2 +- tools/perf/util/thread.c | 3 +-- 3 files changed, 37 insertions(+), 25 deletions(-) diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index f13fd3a9686b1..94a97923527df 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -334,20 +334,16 @@ size_t maps__fprintf(struct maps *maps, FILE *fp) return args.printed; } -int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp) +/* + * Find first map where end > map->start. + * Same as find_vma() in kernel. + */ +static struct rb_node *first_ending_after(struct maps *maps, const struct map *map) { struct rb_root *root; struct rb_node *next, *first; - int err = 0; - - down_write(maps__lock(maps)); root = maps__entries(maps); - - /* - * Find first map where end > map->start. - * Same as find_vma() in kernel. - */ next = root->rb_node; first = NULL; while (next) { @@ -361,8 +357,23 @@ int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp) } else next = next->rb_right; } + return first; +} + +/* + * Adds new to maps, if new overlaps existing entries then the existing maps are + * adjusted or removed so that new fits without overlapping any entries. + */ +int maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) +{ + + struct rb_node *next; + int err = 0; + FILE *fp = debug_file(); + + down_write(maps__lock(maps)); - next = first; + next = first_ending_after(maps, new); while (next && !err) { struct map_rb_node *pos = rb_entry(next, struct map_rb_node, rb_node); next = rb_next(&pos->rb_node); @@ -371,27 +382,27 @@ int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp) * Stop if current map starts after map->end. * Maps are ordered by start: next will not overlap for sure. */ - if (map__start(pos->map) >= map__end(map)) + if (map__start(pos->map) >= map__end(new)) break; if (verbose >= 2) { if (use_browser) { pr_debug("overlapping maps in %s (disable tui for more info)\n", - map__dso(map)->name); + map__dso(new)->name); } else { - fputs("overlapping maps:\n", fp); - map__fprintf(map, fp); + pr_debug("overlapping maps:\n"); + map__fprintf(new, fp); map__fprintf(pos->map, fp); } } - rb_erase_init(&pos->rb_node, root); + rb_erase_init(&pos->rb_node, maps__entries(maps)); /* * Now check if we need to create new maps for areas not * overlapped by the new map: */ - if (map__start(map) > map__start(pos->map)) { + if (map__start(new) > map__start(pos->map)) { struct map *before = map__clone(pos->map); if (before == NULL) { @@ -399,7 +410,7 @@ int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp) goto put_map; } - map__set_end(before, map__start(map)); + map__set_end(before, map__start(new)); err = __maps__insert(maps, before); if (err) { map__put(before); @@ -411,7 +422,7 @@ int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp) map__put(before); } - if (map__end(map) < map__end(pos->map)) { + if (map__end(new) < map__end(pos->map)) { struct map *after = map__clone(pos->map); if (after == NULL) { @@ -419,10 +430,10 @@ int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp) goto put_map; } - map__set_start(after, map__end(map)); - map__add_pgoff(after, map__end(map) - map__start(pos->map)); - assert(map__map_ip(pos->map, map__end(map)) == - map__map_ip(after, map__end(map))); + map__set_start(after, map__end(new)); + map__add_pgoff(after, map__end(new) - map__start(pos->map)); + assert(map__map_ip(pos->map, map__end(new)) == + map__map_ip(after, map__end(new))); err = __maps__insert(maps, after); if (err) { map__put(after); @@ -436,6 +447,8 @@ put_map: map__put(pos->map); free(pos); } + /* Add the map. */ + err = __maps__insert(maps, new); up_write(maps__lock(maps)); return err; } diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h index b94ad5c8fea74..62e94d443c029 100644 --- a/tools/perf/util/maps.h +++ b/tools/perf/util/maps.h @@ -133,7 +133,7 @@ struct addr_map_symbol; int maps__find_ams(struct maps *maps, struct addr_map_symbol *ams); -int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp); +int maps__fixup_overlap_and_insert(struct maps *maps, struct map *new); struct map *maps__find_by_name(struct maps *maps, const char *name); diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index b6986a81aa6df..3d47b5c5528b2 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -345,8 +345,7 @@ int thread__insert_map(struct thread *thread, struct map *map) if (ret) return ret; - maps__fixup_overlappings(thread__maps(thread), map, stderr); - return maps__insert(thread__maps(thread), map); + return maps__fixup_overlap_and_insert(thread__maps(thread), map); } struct thread__prepare_access_maps_cb_args { -- GitLab From 980d7927213a57c9a31ff4ea685eb93fbe4b31ab Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:51 -0800 Subject: [PATCH 0422/1290] perf maps: Do simple merge if given map doesn't overlap Simplify merge in for the simple case of a non-overlapping map. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-18-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 94a97923527df..f305a4834cf09 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -697,9 +697,20 @@ void maps__fixup_end(struct maps *maps) int maps__merge_in(struct maps *kmaps, struct map *new_map) { struct map_rb_node *rb_node; + struct rb_node *first; + bool overlaps; LIST_HEAD(merged); int err = 0; + down_read(maps__lock(kmaps)); + first = first_ending_after(kmaps, new_map); + rb_node = first ? rb_entry(first, struct map_rb_node, rb_node) : NULL; + overlaps = rb_node && map__start(rb_node->map) < map__end(new_map); + up_read(maps__lock(kmaps)); + + if (!overlaps) + return maps__insert(kmaps, new_map); + maps__for_each_entry(kmaps, rb_node) { struct map *old_map = rb_node->map; -- GitLab From 9084952704ba075de28684301ec282b6626b5e7a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:52 -0800 Subject: [PATCH 0423/1290] perf maps: Rename clone to copy from Rename maps__clone() to maps__copy_from() to be more intention revealing of its behavior. Pass the underlying maps rather than the thread. Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-19-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 2 +- tools/perf/util/maps.c | 6 +----- tools/perf/util/maps.h | 3 +-- tools/perf/util/thread.c | 2 +- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index ca855fc435ac5..38bf7108821d3 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -453,7 +453,7 @@ static struct thread *findnew_guest_code(struct machine *machine, * Guest code can be found in hypervisor process at the same address * so copy host maps. */ - err = maps__clone(thread, thread__maps(host_thread)); + err = maps__copy_from(thread__maps(thread), thread__maps(host_thread)); thread__put(host_thread); if (err) goto out_err; diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index f305a4834cf09..986daa1b04978 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -453,12 +453,8 @@ put_map: return err; } -/* - * XXX This should not really _copy_ te maps, but refcount them. - */ -int maps__clone(struct thread *thread, struct maps *parent) +int maps__copy_from(struct maps *maps, struct maps *parent) { - struct maps *maps = thread__maps(thread); int err; struct map_rb_node *rb_node; diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h index 62e94d443c029..e4a49d6ff5cfc 100644 --- a/tools/perf/util/maps.h +++ b/tools/perf/util/maps.h @@ -14,7 +14,6 @@ struct ref_reloc_sym; struct machine; struct map; struct maps; -struct thread; struct map_rb_node { struct rb_node rb_node; @@ -61,7 +60,7 @@ struct kmap { struct maps *maps__new(struct machine *machine); bool maps__empty(struct maps *maps); -int maps__clone(struct thread *thread, struct maps *parent); +int maps__copy_from(struct maps *maps, struct maps *parent); struct maps *maps__get(struct maps *maps); void maps__put(struct maps *maps); diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 3d47b5c5528b2..89c47a5098e28 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -390,7 +390,7 @@ static int thread__clone_maps(struct thread *thread, struct thread *parent, bool return 0; } /* But this one is new process, copy maps. */ - return do_maps_clone ? maps__clone(thread, thread__maps(parent)) : 0; + return do_maps_clone ? maps__copy_from(thread__maps(thread), thread__maps(parent)) : 0; } int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp, bool do_maps_clone) -- GitLab From e77b0236cd0cd1572c6a9b25097b207eab799e74 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:53 -0800 Subject: [PATCH 0424/1290] perf maps: Add maps__load_first() Avoid bpf_lock_contention_read touching the internal maps data structure by adding a helper function. As access is done directly on the map in maps, hold the read lock to stop it being removed. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-20-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_lock_contention.c | 2 +- tools/perf/util/maps.c | 13 +++++++++++++ tools/perf/util/maps.h | 2 ++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c index f1716c089c991..31ff19afc20c1 100644 --- a/tools/perf/util/bpf_lock_contention.c +++ b/tools/perf/util/bpf_lock_contention.c @@ -318,7 +318,7 @@ int lock_contention_read(struct lock_contention *con) } /* make sure it loads the kernel map */ - map__load(maps__first(machine->kmaps)->map); + maps__load_first(machine->kmaps); prev_key = NULL; while (!bpf_map_get_next_key(fd, prev_key, &key)) { diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 986daa1b04978..024a6c9f72c40 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -793,3 +793,16 @@ out: } return err; } + +void maps__load_first(struct maps *maps) +{ + struct map_rb_node *first; + + down_read(maps__lock(maps)); + + first = maps__first(maps); + if (first) + map__load(first->map); + + up_read(maps__lock(maps)); +} diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h index e4a49d6ff5cfc..b7ab3ec61b7c9 100644 --- a/tools/perf/util/maps.h +++ b/tools/perf/util/maps.h @@ -142,4 +142,6 @@ void __maps__sort_by_name(struct maps *maps); void maps__fixup_end(struct maps *maps); +void maps__load_first(struct maps *maps); + #endif // __PERF_MAPS_H -- GitLab From 75858007d101cf38e9a79d682d0361bb6493d7cc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:54 -0800 Subject: [PATCH 0425/1290] perf maps: Add find next entry to give entry after the given map Use to remove map_rb_node use from machine.c. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-21-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 7 +++---- tools/perf/util/maps.c | 11 +++++++++++ tools/perf/util/maps.h | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 38bf7108821d3..b397a769006f4 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1762,12 +1762,11 @@ int machine__create_kernel_maps(struct machine *machine) if (end == ~0ULL) { /* update end address of the kernel map using adjacent module address */ - struct map_rb_node *rb_node = maps__find_node(machine__kernel_maps(machine), - machine__kernel_map(machine)); - struct map_rb_node *next = map_rb_node__next(rb_node); + struct map *next = maps__find_next_entry(machine__kernel_maps(machine), + machine__kernel_map(machine)); if (next) - machine__set_kernel_mmap(machine, start, map__start(next->map)); + machine__set_kernel_mmap(machine, start, map__start(next)); } out_put: diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 024a6c9f72c40..5b898a0e97b26 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -663,6 +663,17 @@ out_unlock: return map; } +struct map *maps__find_next_entry(struct maps *maps, struct map *map) +{ + struct map_rb_node *rb_node = maps__find_node(maps, map); + struct map_rb_node *next = map_rb_node__next(rb_node); + + if (next) + return next->map; + + return NULL; +} + void maps__fixup_end(struct maps *maps) { struct map_rb_node *prev = NULL, *curr; diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h index b7ab3ec61b7c9..84b42c8456e82 100644 --- a/tools/perf/util/maps.h +++ b/tools/perf/util/maps.h @@ -136,6 +136,8 @@ int maps__fixup_overlap_and_insert(struct maps *maps, struct map *new); struct map *maps__find_by_name(struct maps *maps, const char *name); +struct map *maps__find_next_entry(struct maps *maps, struct map *map); + int maps__merge_in(struct maps *kmaps, struct map *new_map); void __maps__sort_by_name(struct maps *maps); -- GitLab From 631bb236aa6f306fd2ba16aee9ae96083453eebc Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:55 -0800 Subject: [PATCH 0426/1290] perf maps: Reduce scope of map_rb_node and maps internals Avoid exposing the implementation of maps so that the internals can be refactored. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-22-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 90 ++++++++++++++++++++++++++---------------- tools/perf/util/maps.h | 23 ----------- 2 files changed, 55 insertions(+), 58 deletions(-) diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 5b898a0e97b26..dcd67384d877d 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -10,6 +10,11 @@ #include "ui/ui.h" #include "unwind.h" +struct map_rb_node { + struct rb_node rb_node; + struct map *map; +}; + #define maps__for_each_entry(maps, map) \ for (map = maps__first(maps); map; map = map_rb_node__next(map)) @@ -17,6 +22,56 @@ for (map = maps__first(maps), next = map_rb_node__next(map); map; \ map = next, next = map_rb_node__next(map)) +static struct rb_root *maps__entries(struct maps *maps) +{ + return &RC_CHK_ACCESS(maps)->entries; +} + +static struct rw_semaphore *maps__lock(struct maps *maps) +{ + return &RC_CHK_ACCESS(maps)->lock; +} + +static struct map **maps__maps_by_name(struct maps *maps) +{ + return RC_CHK_ACCESS(maps)->maps_by_name; +} + +static struct map_rb_node *maps__first(struct maps *maps) +{ + struct rb_node *first = rb_first(maps__entries(maps)); + + if (first) + return rb_entry(first, struct map_rb_node, rb_node); + return NULL; +} + +static struct map_rb_node *map_rb_node__next(struct map_rb_node *node) +{ + struct rb_node *next; + + if (!node) + return NULL; + + next = rb_next(&node->rb_node); + + if (!next) + return NULL; + + return rb_entry(next, struct map_rb_node, rb_node); +} + +static struct map_rb_node *maps__find_node(struct maps *maps, struct map *map) +{ + struct map_rb_node *rb_node; + + maps__for_each_entry(maps, rb_node) { + if (rb_node->RC_CHK_ACCESS(map) == RC_CHK_ACCESS(map)) + return rb_node; + } + return NULL; +} + static void maps__init(struct maps *maps, struct machine *machine) { refcount_set(maps__refcnt(maps), 1); @@ -485,17 +540,6 @@ out_unlock: return err; } -struct map_rb_node *maps__find_node(struct maps *maps, struct map *map) -{ - struct map_rb_node *rb_node; - - maps__for_each_entry(maps, rb_node) { - if (rb_node->RC_CHK_ACCESS(map) == RC_CHK_ACCESS(map)) - return rb_node; - } - return NULL; -} - struct map *maps__find(struct maps *maps, u64 ip) { struct rb_node *p; @@ -521,30 +565,6 @@ out: return m ? m->map : NULL; } -struct map_rb_node *maps__first(struct maps *maps) -{ - struct rb_node *first = rb_first(maps__entries(maps)); - - if (first) - return rb_entry(first, struct map_rb_node, rb_node); - return NULL; -} - -struct map_rb_node *map_rb_node__next(struct map_rb_node *node) -{ - struct rb_node *next; - - if (!node) - return NULL; - - next = rb_next(&node->rb_node); - - if (!next) - return NULL; - - return rb_entry(next, struct map_rb_node, rb_node); -} - static int map__strcmp(const void *a, const void *b) { const struct map *map_a = *(const struct map **)a; diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h index 84b42c8456e82..d836d04c94022 100644 --- a/tools/perf/util/maps.h +++ b/tools/perf/util/maps.h @@ -15,11 +15,6 @@ struct machine; struct map; struct maps; -struct map_rb_node { - struct rb_node rb_node; - struct map *map; -}; - struct map_list_node { struct list_head node; struct map *map; @@ -30,9 +25,6 @@ static inline struct map_list_node *map_list_node__new(void) return malloc(sizeof(struct map_list_node)); } -struct map_rb_node *maps__first(struct maps *maps); -struct map_rb_node *map_rb_node__next(struct map_rb_node *node); -struct map_rb_node *maps__find_node(struct maps *maps, struct map *map); struct map *maps__find(struct maps *maps, u64 addr); DECLARE_RC_STRUCT(maps) { @@ -78,26 +70,11 @@ int maps__for_each_map(struct maps *maps, int (*cb)(struct map *map, void *data) /* Iterate over map removing an entry if cb returns true. */ void maps__remove_maps(struct maps *maps, bool (*cb)(struct map *map, void *data), void *data); -static inline struct rb_root *maps__entries(struct maps *maps) -{ - return &RC_CHK_ACCESS(maps)->entries; -} - static inline struct machine *maps__machine(struct maps *maps) { return RC_CHK_ACCESS(maps)->machine; } -static inline struct rw_semaphore *maps__lock(struct maps *maps) -{ - return &RC_CHK_ACCESS(maps)->lock; -} - -static inline struct map **maps__maps_by_name(struct maps *maps) -{ - return RC_CHK_ACCESS(maps)->maps_by_name; -} - static inline unsigned int maps__nr_maps(const struct maps *maps) { return RC_CHK_ACCESS(maps)->nr_maps; -- GitLab From 7887097c65446ff229099221975f6fc9ad9e378b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:16:56 -0800 Subject: [PATCH 0427/1290] perf maps: Fix up overlaps during fixup_end Maps are sometimes made overlapping, in particular kernel maps. If the end of a map overlaps the start of the next, shorten the overlapping map. This should remove potential non-determinism in maps__find, ie finding maps by address. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Changbin Du Cc: Colin Ian King Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: German Gomez Cc: Guilherme Amadio Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: K Prateek Nayak Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Li Dong Cc: Liam Howlett Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miguel Ojeda Cc: Ming Wang Cc: Namhyung Kim Cc: Nick Terrell Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Vincent Whitchurch Cc: Wenyu Liu Cc: Yang Jihong Link: https://lore.kernel.org/r/20231207011722.1220634-23-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/maps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index dcd67384d877d..0334fc18d9c65 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -701,7 +701,7 @@ void maps__fixup_end(struct maps *maps) down_write(maps__lock(maps)); maps__for_each_entry(maps, curr) { - if (prev != NULL && !map__end(prev->map)) + if (prev && (!map__end(prev->map) || map__end(prev->map) > map__start(curr->map))) map__set_end(prev->map, map__start(curr->map)); prev = curr; -- GitLab From 457caadce7ab71a54ee2d4f032ee4a55b4a28776 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Thu, 21 Dec 2023 14:03:13 +0800 Subject: [PATCH 0428/1290] perf vendor events: Remove UTF-8 characters from cmn.json cmn.json contains UTF-8 characters in brief description which could break the perf build on some distros. Fix this issue by removing the UTF-8 characters from cmn.json. without this fix: $find tools/perf/pmu-events/ -name "*.json" | xargs file -i | grep -v us-ascii tools/perf/pmu-events/arch/arm64/arm/cmn/sys/cmn.json: application/json; charset=utf-8 with it: $ file -i tools/perf/pmu-events/arch/arm64/arm/cmn/sys/cmn.json tools/perf/pmu-events/arch/arm64/arm/cmn/sys/cmn.json: text/plain; charset=us-ascii Fixes: 0b4de7bdf46c5215 ("perf jevents: Add support for Arm CMN PMU aliasing") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Jing Zhang Cc: Adrian Hunter Cc: Heiko Carstens Cc: Ian Rogers Cc: Jing Zhang Cc: Jiri Olsa Cc: Kajol Jain Cc: Namhyung Kim Cc: Sukadev Bhattiprolu Cc: Thomas Richter Link: https://lore.kernel.org/r/1703138593-50486-1-git-send-email-renyu.zj@linux.alibaba.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/arm64/arm/cmn/sys/cmn.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/pmu-events/arch/arm64/arm/cmn/sys/cmn.json b/tools/perf/pmu-events/arch/arm64/arm/cmn/sys/cmn.json index 428605c37d10b..5ec157c39f0df 100644 --- a/tools/perf/pmu-events/arch/arm64/arm/cmn/sys/cmn.json +++ b/tools/perf/pmu-events/arch/arm64/arm/cmn/sys/cmn.json @@ -107,7 +107,7 @@ "EventName": "hnf_qos_hh_retry", "EventidCode": "0xe", "NodeType": "0x5", - "BriefDescription": "Counts number of times a HighHigh priority request is protocolretried at the HN‑F.", + "BriefDescription": "Counts number of times a HighHigh priority request is protocolretried at the HN-F.", "Unit": "arm_cmn", "Compat": "(434|436|43c|43a).*" }, -- GitLab From 22887dfba0633c09444562722f0cf68f61ec9a50 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:20 -0500 Subject: [PATCH 0429/1290] ring-buffer: Clear pages on error in ring_buffer_subbuf_order_set() failure On failure to allocate ring buffer pages, the pointer to the CPU buffer pages is freed, but the pages that were allocated previously were not. Make sure they are freed too. Link: https://lore.kernel.org/linux-trace-kernel/20231219185629.179352802@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Fixes: f9b94daa542a ("tracing: Set new size of the ring buffer sub page") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c2afcf98ea912..3c11e8e811edd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5927,6 +5927,7 @@ error: for_each_buffer_cpu(buffer, cpu) { if (!cpu_buffers[cpu]) continue; + rb_free_cpu_buffer(cpu_buffers[cpu]); kfree(cpu_buffers[cpu]); } kfree(cpu_buffers); -- GitLab From b81e03a24966dca0b119eff0549a4e44befff419 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:21 -0500 Subject: [PATCH 0430/1290] ring-buffer: Do no swap cpu buffers if order is different As all the subbuffer order (subbuffer sizes) must be the same throughout the ring buffer, check the order of the buffers that are doing a CPU buffer swap in ring_buffer_swap_cpu() to make sure they are the same. If the are not the same, then fail to do the swap, otherwise the ring buffer will think the CPU buffer has a specific subbuffer size when it does not. Link: https://lore.kernel.org/linux-trace-kernel/20231219185629.467894710@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3c11e8e811edd..fdcd171b09b5d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5417,6 +5417,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; + if (buffer_a->subbuf_order != buffer_b->subbuf_order) + goto out; + ret = -EAGAIN; if (atomic_read(&buffer_a->record_disabled)) -- GitLab From 4e958db34fd5324421ef9562c31c15e2d152dc63 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:22 -0500 Subject: [PATCH 0431/1290] ring-buffer: Make sure the spare sub buffer used for reads has same size Now that the ring buffer specifies the size of its sub buffers, they all need to be the same size. When doing a read, a swap is done with a spare page. Make sure they are the same size before doing the swap, otherwise the read will fail. Link: https://lore.kernel.org/linux-trace-kernel/20231219185629.763664788@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 711095aa731d3..4dcdc30aa1100 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7582,6 +7582,7 @@ struct ftrace_buffer_info { struct trace_iterator iter; void *spare; unsigned int spare_cpu; + unsigned int spare_size; unsigned int read; }; @@ -8301,6 +8302,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); + /* Make sure the spare matches the current sub buffer size */ + if (info->spare) { + if (page_size != info->spare_size) { + ring_buffer_free_read_page(iter->array_buffer->buffer, + info->spare_cpu, info->spare); + info->spare = NULL; + } + } + if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer, iter->cpu_file); @@ -8309,6 +8319,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->spare = NULL; } else { info->spare_cpu = iter->cpu_file; + info->spare_size = page_size; } } if (!info->spare) -- GitLab From aa067682adf19ca78f81cd57539282dbfd7cce21 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:23 -0500 Subject: [PATCH 0432/1290] tracing: Update snapshot order along with main buffer order When updating the order of the sub buffers for the main buffer, make sure that if the snapshot buffer exists, that it gets its order updated as well. Link: https://lore.kernel.org/linux-trace-kernel/20231219185630.054668186@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 45 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4dcdc30aa1100..2439e00aa4ce9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1263,10 +1263,17 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val); int tracing_alloc_snapshot_instance(struct trace_array *tr) { + int order; int ret; if (!tr->allocated_snapshot) { + /* Make the snapshot buffer have the same order as main buffer */ + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); + if (ret < 0) + return ret; + /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, RING_BUFFER_ALL_CPUS); @@ -1286,6 +1293,7 @@ static void free_snapshot(struct trace_array *tr) * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. */ + ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0); ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&tr->max_buffer, 1); tracing_reset_online_cpus(&tr->max_buffer); @@ -9393,6 +9401,7 @@ buffer_order_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; unsigned long val; + int old_order; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); @@ -9403,12 +9412,44 @@ buffer_order_write(struct file *filp, const char __user *ubuf, if (val < 0 || val > 7) return -EINVAL; + old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + if (old_order == val) + return 0; + ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val); if (ret) - return ret; + return 0; - (*ppos)++; +#ifdef CONFIG_TRACER_MAX_TRACE + + if (!tr->allocated_snapshot) + goto out_max; + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, val); + if (ret) { + /* Put back the old order */ + cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order); + if (WARN_ON_ONCE(cnt)) { + /* + * AARGH! We are left with different orders! + * The max buffer is our "snapshot" buffer. + * When a tracer needs a snapshot (one of the + * latency tracers), it swaps the max buffer + * with the saved snap shot. We succeeded to + * update the order of the main buffer, but failed to + * update the order of the max buffer. But when we tried + * to reset the main buffer to the original size, we + * failed there too. This is very unlikely to + * happen, but if it does, warn and kill all + * tracing. + */ + tracing_disabled = 1; + } + return ret; + } + out_max: +#endif + (*ppos)++; return cnt; } -- GitLab From 1075ee66a8c19bfa375b19c236fd6a22a867f138 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 19 Dec 2023 20:33:50 +0100 Subject: [PATCH 0433/1290] dmaengine: idxd: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Note that the upper limit of ida_simple_get() is exclusive, but the one of ida_alloc_range() is inclusive. Sothis change allows one more device. MINORMASK is ((1U << MINORBITS) - 1), so allowing MINORMASK as a maximum value makes sense. It is also consistent with other "ida_.*MINORMASK" and "ida_*MINOR()" usages. Signed-off-by: Christophe JAILLET Reviewed-by: Fenghua Yu Acked-by: Lijun Pan Link: https://lore.kernel.org/r/ac991f5f42112fa782a881d391d447529cbc4a23.1702967302.git.christophe.jaillet@wanadoo.fr Signed-off-by: Vinod Koul --- drivers/dma/idxd/cdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 0423655f5a880..b00926abc69ae 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -165,7 +165,7 @@ static void idxd_cdev_dev_release(struct device *dev) struct idxd_wq *wq = idxd_cdev->wq; cdev_ctx = &ictx[wq->idxd->data->type]; - ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor); + ida_free(&cdev_ctx->minor_ida, idxd_cdev->minor); kfree(idxd_cdev); } @@ -463,7 +463,7 @@ int idxd_wq_add_cdev(struct idxd_wq *wq) cdev = &idxd_cdev->cdev; dev = cdev_dev(idxd_cdev); cdev_ctx = &ictx[wq->idxd->data->type]; - minor = ida_simple_get(&cdev_ctx->minor_ida, 0, MINORMASK, GFP_KERNEL); + minor = ida_alloc_max(&cdev_ctx->minor_ida, MINORMASK, GFP_KERNEL); if (minor < 0) { kfree(idxd_cdev); return minor; -- GitLab From fa4b54af5ba17a59ce082e9d710eb638bdce7637 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:24 -0500 Subject: [PATCH 0434/1290] tracing: Stop the tracing while changing the ring buffer subbuf size Because the main buffer and the snapshot buffer need to be the same for some tracers, otherwise it will fail and disable all tracing, the tracers need to be stopped while updating the sub buffer sizes so that the tracers see the main and snapshot buffers with the same sub buffer size. Link: https://lore.kernel.org/linux-trace-kernel/20231219185630.353222794@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Fixes: 2808e31ec12e ("ring-buffer: Add interface for configuring trace sub buffer size") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2439e00aa4ce9..82303bd2bba13 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9412,13 +9412,16 @@ buffer_order_write(struct file *filp, const char __user *ubuf, if (val < 0 || val > 7) return -EINVAL; + /* Do not allow tracing while changing the order of the ring buffer */ + tracing_stop_tr(tr); + old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); if (old_order == val) - return 0; + goto out; ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val); if (ret) - return 0; + goto out; #ifdef CONFIG_TRACER_MAX_TRACE @@ -9445,11 +9448,15 @@ buffer_order_write(struct file *filp, const char __user *ubuf, */ tracing_disabled = 1; } - return ret; + goto out; } out_max: #endif (*ppos)++; + out: + if (ret) + cnt = ret; + tracing_start_tr(tr); return cnt; } -- GitLab From 71a5197e2b872afeef8ade3099ffc4050466b542 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 17 Dec 2023 22:08:34 -0800 Subject: [PATCH 0435/1290] dmaengine: std_dma40: fix kernel-doc warnings and spelling Correct kernel-doc warnings as reported by kernel test robot: ste_dma40.c:57: warning: Excess struct member 'dev_tx' description in 'stedma40_platform_data' ste_dma40.c:57: warning: Excess struct member 'dev_rx' description in 'stedma40_platform_data' Correct spellos as reported by codespell. Signed-off-by: Randy Dunlap Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312171417.izbQThoU-lkp@intel.com/ Cc: Linus Walleij Cc: linux-arm-kernel@lists.infradead.org Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20231218060834.19222-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- drivers/dma/ste_dma40.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c index 002833fb1fa04..2c489299148ee 100644 --- a/drivers/dma/ste_dma40.c +++ b/drivers/dma/ste_dma40.c @@ -31,13 +31,11 @@ /** * struct stedma40_platform_data - Configuration struct for the dma device. * - * @dev_tx: mapping between destination event line and io address - * @dev_rx: mapping between source event line and io address * @disabled_channels: A vector, ending with -1, that marks physical channels * that are for different reasons not available for the driver. * @soft_lli_chans: A vector, that marks physical channels will use LLI by SW * which avoids HW bug that exists in some versions of the controller. - * SoftLLI introduces relink overhead that could impact performace for + * SoftLLI introduces relink overhead that could impact performance for * certain use cases. * @num_of_soft_lli_chans: The number of channels that needs to be configured * to use SoftLLI. @@ -184,7 +182,7 @@ static __maybe_unused u32 d40_backup_regs[] = { /* * since 9540 and 8540 has the same HW revision - * use v4a for 9540 or ealier + * use v4a for 9540 or earlier * use v4b for 8540 or later * HW revision: * DB8500ed has revision 0 @@ -411,7 +409,7 @@ struct d40_desc { * * @base: The virtual address of LCLA. 18 bit aligned. * @dma_addr: DMA address, if mapped - * @base_unaligned: The orignal kmalloc pointer, if kmalloc is used. + * @base_unaligned: The original kmalloc pointer, if kmalloc is used. * This pointer is only there for clean-up on error. * @pages: The number of pages needed for all physical channels. * Only used later for clean-up on error @@ -1655,7 +1653,7 @@ static void dma_tasklet(struct tasklet_struct *t) return; check_pending_tx: - /* Rescue manouver if receiving double interrupts */ + /* Rescue maneuver if receiving double interrupts */ if (d40c->pending_tx > 0) d40c->pending_tx--; spin_unlock_irqrestore(&d40c->lock, flags); @@ -3412,7 +3410,7 @@ static int __init d40_lcla_allocate(struct d40_base *base) base->lcla_pool.base = (void *)page_list[i]; } else { /* - * After many attempts and no succees with finding the correct + * After many attempts and no success with finding the correct * alignment, try with allocating a big buffer. */ dev_warn(base->dev, -- GitLab From 353cc219372935805218e05a7754a059ab104641 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:25 -0500 Subject: [PATCH 0436/1290] ring-buffer: Keep the same size when updating the order The function ring_buffer_subbuf_order_set() just updated the sub-buffers to the new size, but this also changes the size of the buffer in doing so. As the size is determined by nr_pages * subbuf_size. If the subbuf_size is increased without decreasing the nr_pages, this causes the total size of the buffer to increase. This broke the latency tracers as the snapshot needs to be the same size as the main buffer. The size of the snapshot buffer is only expanded when needed, and because the order is still the same, the size becomes out of sync with the main buffer, as the main buffer increased in size without the tracing system knowing. Calculate the nr_pages to allocate with the new subbuf_size to be buffer_size / new_subbuf_size. Link: https://lore.kernel.org/linux-trace-kernel/20231219185630.649397785@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Fixes: f9b94daa542a ("ring-buffer: Set new size of the ring buffer sub page") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fdcd171b09b5d..23ead7602da0b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5897,7 +5897,10 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; - nr_pages = buffer->buffers[cpu]->nr_pages; + /* Update the number of pages to match the new size */ + nr_pages = old_size * buffer->buffers[cpu]->nr_pages; + nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); + cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!cpu_buffers[cpu]) { err = -ENOMEM; -- GitLab From 8e7b58c27b3c567316a51079b375b846f9223bba Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:26 -0500 Subject: [PATCH 0437/1290] ring-buffer: Just update the subbuffers when changing their allocation order The ring_buffer_subbuf_order_set() was creating ring_buffer_per_cpu cpu_buffers with the new subbuffers with the updated order, and if they all successfully were created, then they the ring_buffer's per_cpu buffers would be freed and replaced by them. The problem is that the freed per_cpu buffers contains state that would be lost. Running the following commands: 1. # echo 3 > /sys/kernel/tracing/buffer_subbuf_order 2. # echo 0 > /sys/kernel/tracing/tracing_cpumask 3. # echo 1 > /sys/kernel/tracing/snapshot 4. # echo ff > /sys/kernel/tracing/tracing_cpumask 5. # echo test > /sys/kernel/tracing/trace_marker Would result in: -bash: echo: write error: Bad file descriptor That's because the state of the per_cpu buffers of the snapshot buffer is lost when the order is changed (the order of a freed snapshot buffer goes to 0 to save memory, and when the snapshot buffer is allocated again, it goes back to what the main buffer is). In operation 2, the snapshot buffers were set to "disable" (as all the ring buffers CPUs were disabled). In operation 3, the snapshot is allocated and a call to ring_buffer_subbuf_order_set() replaced the per_cpu buffers losing the "record_disable" count. When it was enabled again, the atomic_dec(&cpu_buffer->record_disable) was decrementing a zero, setting it to -1. Writing 1 into the snapshot would swap the snapshot buffer with the main buffer, so now the main buffer is "disabled", and nothing can write to the ring buffer anymore. Instead of creating new per_cpu buffers and losing the state of the old buffers, basically do what the resize does and just allocate new subbuf pages into the new_pages link list of the per_cpu buffer and if they all succeed, then replace the old sub buffers with the new ones. This keeps the per_cpu buffer descriptor in tact and by doing so, keeps its state. Link: https://lore.kernel.org/linux-trace-kernel/20231219185630.944104939@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Fixes: f9b94daa542a ("ring-buffer: Set new size of the ring buffer sub page") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 88 ++++++++++++++++++++++++++++++-------- 1 file changed, 71 insertions(+), 17 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 23ead7602da0b..7ee6779bf292d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5856,11 +5856,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get); */ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) { - struct ring_buffer_per_cpu **cpu_buffers; + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage, *tmp; int old_order, old_size; int nr_pages; int psize; - int bsize; int err; int cpu; @@ -5874,11 +5874,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) if (psize <= BUF_PAGE_HDR_SIZE) return -EINVAL; - bsize = sizeof(void *) * buffer->cpus; - cpu_buffers = kzalloc(bsize, GFP_KERNEL); - if (!cpu_buffers) - return -ENOMEM; - old_order = buffer->subbuf_order; old_size = buffer->subbuf_size; @@ -5894,33 +5889,88 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) /* Make sure all new buffers are allocated, before deleting the old ones */ for_each_buffer_cpu(buffer, cpu) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; + cpu_buffer = buffer->buffers[cpu]; + /* Update the number of pages to match the new size */ nr_pages = old_size * buffer->buffers[cpu]->nr_pages; nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); - cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); - if (!cpu_buffers[cpu]) { + /* we need a minimum of two pages */ + if (nr_pages < 2) + nr_pages = 2; + + cpu_buffer->nr_pages_to_update = nr_pages; + + /* Include the reader page */ + nr_pages++; + + /* Allocate the new size buffer */ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (__rb_allocate_pages(cpu_buffer, nr_pages, + &cpu_buffer->new_pages)) { + /* not enough memory for new pages */ err = -ENOMEM; goto error; } } for_each_buffer_cpu(buffer, cpu) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; - rb_free_cpu_buffer(buffer->buffers[cpu]); - buffer->buffers[cpu] = cpu_buffers[cpu]; + cpu_buffer = buffer->buffers[cpu]; + + /* Clear the head bit to make the link list normal to read */ + rb_head_page_deactivate(cpu_buffer); + + /* Now walk the list and free all the old sub buffers */ + list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + /* The above loop stopped an the last page needing to be freed */ + bpage = list_entry(cpu_buffer->pages, struct buffer_page, list); + free_buffer_page(bpage); + + /* Free the current reader page */ + free_buffer_page(cpu_buffer->reader_page); + + /* One page was allocated for the reader page */ + cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, + struct buffer_page, list); + list_del_init(&cpu_buffer->reader_page->list); + + /* The cpu_buffer pages are a link list with no head */ + cpu_buffer->pages = cpu_buffer->new_pages.next; + cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev; + cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next; + + /* Clear the new_pages list */ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + + cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; + cpu_buffer->nr_pages_to_update = 0; + + free_pages((unsigned long)cpu_buffer->free_page, old_order); + cpu_buffer->free_page = NULL; + + rb_head_page_activate(cpu_buffer); + + rb_check_pages(cpu_buffer); } atomic_dec(&buffer->record_disabled); mutex_unlock(&buffer->mutex); - kfree(cpu_buffers); - return 0; error: @@ -5931,12 +5981,16 @@ error: mutex_unlock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { - if (!cpu_buffers[cpu]) + cpu_buffer = buffer->buffers[cpu]; + + if (!cpu_buffer->nr_pages_to_update) continue; - rb_free_cpu_buffer(cpu_buffers[cpu]); - kfree(cpu_buffers[cpu]); + + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } } - kfree(cpu_buffers); return err; } -- GitLab From 7c3f48026589b532ebc0d12f939f6be78bd74a58 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:27 -0500 Subject: [PATCH 0438/1290] ring-buffer: Add documentation on the buffer_subbuf_order file Add to the documentation how to use the buffer_subbuf_order file to change the size and how it affects what events can be added to the ring buffer. Link: https://lore.kernel.org/linux-trace-kernel/20231219185631.230636734@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/ftrace.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 23572f6697c0a..231d26ceedb05 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -203,6 +203,33 @@ of ftrace. Here is a list of some of the key files: This displays the total combined size of all the trace buffers. + buffer_subbuf_order: + + This sets or displays the sub buffer page size order. The ring buffer + is broken up into several same size "sub buffers". An event can not be + bigger than the size of the sub buffer. Normally, the sub buffer is + the size of the architecture's page (4K on x86). The sub buffer also + contains meta data at the start which also limits the size of an event. + That means when the sub buffer is a page size, no event can be larger + than the page size minus the sub buffer meta data. + + The buffer_subbuf_order allows the user to change the size of the sub + buffer. As the sub buffer is a set of pages by the power of 2, thus + the sub buffer total size is defined by the order: + + order size + ---- ---- + 0 PAGE_SIZE + 1 PAGE_SIZE * 2 + 2 PAGE_SIZE * 4 + 3 PAGE_SIZE * 8 + + Changing the order will change the sub buffer size allowing for events + to be larger than the page size. + + Note: When changing the order, tracing is stopped and any data in the + ring buffer and the snapshot buffer will be discarded. + free_buffer: If a process is performing tracing, and the ring buffer should be -- GitLab From 1acce70374caabc2945aa07a862f834fb7c2914f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:28 -0500 Subject: [PATCH 0439/1290] ringbuffer/selftest: Add basic selftest to test changing subbuf order Add a self test that will write into the trace buffer with differ trace sub buffer order sizes. Link: https://lore.kernel.org/linux-trace-kernel/20231219185631.520496304@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Steven Rostedt (Google) --- .../ftrace/test.d/00basic/ringbuffer_order.tc | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc new file mode 100644 index 0000000000000..ecbcc810e6c16 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc @@ -0,0 +1,95 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Change the ringbuffer sub-buffer order +# requires: buffer_subbuf_order +# flags: instance + +get_buffer_data_size() { + sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page +} + +get_buffer_data_offset() { + sed -ne 's/^.*data.*offset:\([0-9][0-9]*\).*/\1/p' events/header_page +} + +get_event_header_size() { + type_len=`sed -ne 's/^.*type_len.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event` + time_len=`sed -ne 's/^.*time_delta.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event` + array_len=`sed -ne 's/^.*array.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event` + total_bits=$((type_len+time_len+array_len)) + total_bits=$((total_bits+7)) + echo $((total_bits/8)) +} + +get_print_event_buf_offset() { + sed -ne 's/^.*buf.*offset:\([0-9][0-9]*\).*/\1/p' events/ftrace/print/format +} + +event_header_size=`get_event_header_size` +print_header_size=`get_print_event_buf_offset` + +data_offset=`get_buffer_data_offset` + +marker_meta=$((event_header_size+print_header_size)) + +make_str() { + cnt=$1 + printf -- 'X%.0s' $(seq $cnt) +} + +write_buffer() { + size=$1 + + str=`make_str $size` + + # clear the buffer + echo > trace + + # write the string into the marker + echo $str > trace_marker + + echo $str +} + +test_buffer() { + orde=$1 + page_size=$((4096< buffer_subbuf_order + test_buffer $a +done + +echo $ORIG > buffer_subbuf_order + -- GitLab From 2f84b39f48476615186af5b3220ad5a2c756679b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 13:54:29 -0500 Subject: [PATCH 0440/1290] tracing: Update subbuffer with kilobytes not page order Using page order for deciding what the size of the ring buffer sub buffers are is exposing a bit too much of the implementation. Although the sub buffers are only allocated in orders of pages, allow the user to specify the minimum size of each sub-buffer via kilobytes like they can with the buffer size itself. If the user specifies 3 via: echo 3 > buffer_subbuf_size_kb Then the sub-buffer size will round up to 4kb (on a 4kb page size system). If they specify: echo 6 > buffer_subbuf_size_kb The sub-buffer size will become 8kb. and so on. Link: https://lore.kernel.org/linux-trace-kernel/20231219185631.809766769@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/ftrace.rst | 46 ++++++++----------- kernel/trace/trace.c | 38 +++++++++------ ...fer_order.tc => ringbuffer_subbuf_size.tc} | 18 ++++---- 3 files changed, 54 insertions(+), 48 deletions(-) rename tools/testing/selftests/ftrace/test.d/00basic/{ringbuffer_order.tc => ringbuffer_subbuf_size.tc} (85%) diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 231d26ceedb05..933e7efb9f1b3 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -203,32 +203,26 @@ of ftrace. Here is a list of some of the key files: This displays the total combined size of all the trace buffers. - buffer_subbuf_order: - - This sets or displays the sub buffer page size order. The ring buffer - is broken up into several same size "sub buffers". An event can not be - bigger than the size of the sub buffer. Normally, the sub buffer is - the size of the architecture's page (4K on x86). The sub buffer also - contains meta data at the start which also limits the size of an event. - That means when the sub buffer is a page size, no event can be larger - than the page size minus the sub buffer meta data. - - The buffer_subbuf_order allows the user to change the size of the sub - buffer. As the sub buffer is a set of pages by the power of 2, thus - the sub buffer total size is defined by the order: - - order size - ---- ---- - 0 PAGE_SIZE - 1 PAGE_SIZE * 2 - 2 PAGE_SIZE * 4 - 3 PAGE_SIZE * 8 - - Changing the order will change the sub buffer size allowing for events - to be larger than the page size. - - Note: When changing the order, tracing is stopped and any data in the - ring buffer and the snapshot buffer will be discarded. + buffer_subbuf_size_kb: + + This sets or displays the sub buffer size. The ring buffer is broken up + into several same size "sub buffers". An event can not be bigger than + the size of the sub buffer. Normally, the sub buffer is the size of the + architecture's page (4K on x86). The sub buffer also contains meta data + at the start which also limits the size of an event. That means when + the sub buffer is a page size, no event can be larger than the page + size minus the sub buffer meta data. + + Note, the buffer_subbuf_size_kb is a way for the user to specify the + minimum size of the subbuffer. The kernel may make it bigger due to the + implementation details, or simply fail the operation if the kernel can + not handle the request. + + Changing the sub buffer size allows for events to be larger than the + page size. + + Note: When changing the sub-buffer size, tracing is stopped and any + data in the ring buffer and the snapshot buffer will be discarded. free_buffer: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82303bd2bba13..46dbe22121e91 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9384,42 +9384,54 @@ static const struct file_operations buffer_percent_fops = { }; static ssize_t -buffer_order_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; + size_t size; char buf[64]; + int order; int r; - r = sprintf(buf, "%d\n", ring_buffer_subbuf_order_get(tr->array_buffer.buffer)); + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + size = (PAGE_SIZE << order) / 1024; + + r = sprintf(buf, "%zd\n", size); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t -buffer_order_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +buffer_subbuf_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; unsigned long val; int old_order; + int order; + int pages; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; + val *= 1024; /* value passed in is in KB */ + + pages = DIV_ROUND_UP(val, PAGE_SIZE); + order = fls(pages - 1); + /* limit between 1 and 128 system pages */ - if (val < 0 || val > 7) + if (order < 0 || order > 7) return -EINVAL; /* Do not allow tracing while changing the order of the ring buffer */ tracing_stop_tr(tr); old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); - if (old_order == val) + if (old_order == order) goto out; - ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val); + ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order); if (ret) goto out; @@ -9428,7 +9440,7 @@ buffer_order_write(struct file *filp, const char __user *ubuf, if (!tr->allocated_snapshot) goto out_max; - ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, val); + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); if (ret) { /* Put back the old order */ cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order); @@ -9460,10 +9472,10 @@ buffer_order_write(struct file *filp, const char __user *ubuf, return cnt; } -static const struct file_operations buffer_order_fops = { +static const struct file_operations buffer_subbuf_size_fops = { .open = tracing_open_generic_tr, - .read = buffer_order_read, - .write = buffer_order_write, + .read = buffer_subbuf_size_read, + .write = buffer_subbuf_size_write, .release = tracing_release_generic_tr, .llseek = default_llseek, }; @@ -9934,8 +9946,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, tr, &buffer_percent_fops); - trace_create_file("buffer_subbuf_order", TRACE_MODE_WRITE, d_tracer, - tr, &buffer_order_fops); + trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, + tr, &buffer_subbuf_size_fops); create_trace_options_dir(tr); diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc similarity index 85% rename from tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc rename to tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc index ecbcc810e6c16..d44d09a33a743 100644 --- a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc +++ b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 -# description: Change the ringbuffer sub-buffer order -# requires: buffer_subbuf_order +# description: Change the ringbuffer sub-buffer size +# requires: buffer_subbuf_size_kb # flags: instance get_buffer_data_size() { @@ -52,8 +52,8 @@ write_buffer() { } test_buffer() { - orde=$1 - page_size=$((4096< buffer_subbuf_order +for a in 4 8 16 32 ; do + echo $a > buffer_subbuf_size_kb test_buffer $a done -echo $ORIG > buffer_subbuf_order +echo $ORIG > buffer_subbuf_size_kb -- GitLab From 3cb3091138ca0921c4569bcf7ffa062519639b6a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 19 Dec 2023 17:38:00 -0500 Subject: [PATCH 0441/1290] ring-buffer: Use subbuf_order for buffer page masking The comparisons to PAGE_SIZE were all converted to use the buffer->subbuf_order, but the use of PAGE_MASK was missed. Convert all the PAGE_MASK usages over to: (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1 Link: https://lore.kernel.org/linux-trace-kernel/20231219173800.66eefb7a@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Tzvetomir Stoyanov Cc: Vincent Donnefort Cc: Kent Overstreet Fixes: 139f84002145 ("ring-buffer: Page size per ring buffer") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7ee6779bf292d..173d2595ce2d9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2269,11 +2269,13 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) } static __always_inline unsigned -rb_event_index(struct ring_buffer_event *event) +rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; - return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; + addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1; + + return addr - BUF_PAGE_HDR_SIZE; } static void rb_inc_iter(struct ring_buffer_iter *iter) @@ -2646,7 +2648,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Slow path */ static struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) +rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, u64 delta, bool abs) { if (abs) event->type_len = RINGBUF_TYPE_TIME_STAMP; @@ -2654,7 +2657,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) event->type_len = RINGBUF_TYPE_TIME_EXTEND; /* Not the first event on the page, or not delta? */ - if (abs || rb_event_index(event)) { + if (abs || rb_event_index(cpu_buffer, event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { @@ -2728,7 +2731,7 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, if (!abs) info->delta = 0; } - *event = rb_add_time_stamp(*event, info->delta, abs); + *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs); *length -= RB_LEN_TIME_EXTEND; *delta = 0; } @@ -2812,10 +2815,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage; unsigned long addr; - new_index = rb_event_index(event); + new_index = rb_event_index(cpu_buffer, event); old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; - addr &= PAGE_MASK; + addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); bpage = READ_ONCE(cpu_buffer->tail_page); @@ -3726,7 +3729,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage = cpu_buffer->commit_page; struct buffer_page *start; - addr &= PAGE_MASK; + addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); /* Do the likely case first */ if (likely(bpage->page == (void *)addr)) { -- GitLab From 3b3b5339cdc67e98817d08431f8443b08880084f Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Mon, 18 Dec 2023 09:56:38 +0800 Subject: [PATCH 0442/1290] dt-bindings: dmaengine: Add Loongson LS2X APB DMA controller Add Loongson LS2X APB DMA controller binding with DT schema format using json-schema. Reviewed-by: Conor Dooley Signed-off-by: Binbin Zhou Link: https://lore.kernel.org/r/078307641077edaf46dd986c6d31cea15545a208.1702365725.git.zhoubinbin@loongson.cn Signed-off-by: Vinod Koul --- .../bindings/dma/loongson,ls2x-apbdma.yaml | 62 +++++++++++++++++++ MAINTAINERS | 6 ++ 2 files changed, 68 insertions(+) create mode 100644 Documentation/devicetree/bindings/dma/loongson,ls2x-apbdma.yaml diff --git a/Documentation/devicetree/bindings/dma/loongson,ls2x-apbdma.yaml b/Documentation/devicetree/bindings/dma/loongson,ls2x-apbdma.yaml new file mode 100644 index 0000000000000..6a1b49a49a646 --- /dev/null +++ b/Documentation/devicetree/bindings/dma/loongson,ls2x-apbdma.yaml @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/dma/loongson,ls2x-apbdma.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Loongson LS2X APB DMA controller + +description: + The Loongson LS2X APB DMA controller is used for transferring data + between system memory and the peripherals on the APB bus. + +maintainers: + - Binbin Zhou + +allOf: + - $ref: dma-controller.yaml# + +properties: + compatible: + oneOf: + - const: loongson,ls2k1000-apbdma + - items: + - const: loongson,ls2k0500-apbdma + - const: loongson,ls2k1000-apbdma + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + + '#dma-cells': + const: 1 + +required: + - compatible + - reg + - interrupts + - clocks + - '#dma-cells' + +additionalProperties: false + +examples: + - | + #include + #include + + dma-controller@1fe00c00 { + compatible = "loongson,ls2k1000-apbdma"; + reg = <0x1fe00c00 0x8>; + interrupt-parent = <&liointc1>; + interrupts = <12 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #dma-cells = <1>; + }; + +... diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cfd..c9a48bce986e0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12507,6 +12507,12 @@ S: Maintained F: Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml F: drivers/gpio/gpio-loongson-64bit.c +LOONGSON LS2X APB DMA DRIVER +M: Binbin Zhou +L: dmaengine@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/dma/loongson,ls2x-apbdma.yaml + LOONGSON LS2X I2C DRIVER M: Binbin Zhou L: linux-i2c@vger.kernel.org -- GitLab From 71e7d3cb6e55ae2eadcdb178f9243dc18499d369 Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Mon, 18 Dec 2023 09:56:39 +0800 Subject: [PATCH 0443/1290] dmaengine: ls2x-apb: New driver for the Loongson LS2X APB DMA controller The Loongson LS2X APB DMA controller is available on Loongson-2K chips. It is a single-channel, configurable DMA controller IP core based on the AXI bus, whose main function is to integrate DMA functionality on a chip dedicated to carrying data between memory and peripherals in APB bus (e.g. nand). Signed-off-by: Binbin Zhou Signed-off-by: Yingkun Meng Link: https://lore.kernel.org/r/8df2a0199434fba3535831082966c2442ecf1cae.1702365725.git.zhoubinbin@loongson.cn Signed-off-by: Vinod Koul --- MAINTAINERS | 1 + drivers/dma/Kconfig | 14 + drivers/dma/Makefile | 1 + drivers/dma/ls2x-apb-dma.c | 705 +++++++++++++++++++++++++++++++++++++ 4 files changed, 721 insertions(+) create mode 100644 drivers/dma/ls2x-apb-dma.c diff --git a/MAINTAINERS b/MAINTAINERS index c9a48bce986e0..331bac4abde16 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12512,6 +12512,7 @@ M: Binbin Zhou L: dmaengine@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/dma/loongson,ls2x-apbdma.yaml +F: drivers/dma/ls2x-apb-dma.c LOONGSON LS2X I2C DRIVER M: Binbin Zhou diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 70ba506dabab5..e928f2ca0f1e9 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -378,6 +378,20 @@ config LPC18XX_DMAMUX Enable support for DMA on NXP LPC18xx/43xx platforms with PL080 and multiplexed DMA request lines. +config LS2X_APB_DMA + tristate "Loongson LS2X APB DMA support" + depends on LOONGARCH || COMPILE_TEST + select DMA_ENGINE + select DMA_VIRTUAL_CHANNELS + help + Support for the Loongson LS2X APB DMA controller driver. The + DMA controller is having single DMA channel which can be + configured for different peripherals like audio, nand, sdio + etc which is in APB bus. + + This DMA controller transfers data from memory to peripheral fifo. + It does not support memory to memory data transfer. + config MCF_EDMA tristate "Freescale eDMA engine support, ColdFire mcf5441x SoCs" depends on M5441x || COMPILE_TEST diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 83553a97a010e..dfd40d14e4089 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_INTEL_IOATDMA) += ioat/ obj-y += idxd/ obj-$(CONFIG_K3_DMA) += k3dma.o obj-$(CONFIG_LPC18XX_DMAMUX) += lpc18xx-dmamux.o +obj-$(CONFIG_LS2X_APB_DMA) += ls2x-apb-dma.o obj-$(CONFIG_MILBEAUT_HDMAC) += milbeaut-hdmac.o obj-$(CONFIG_MILBEAUT_XDMAC) += milbeaut-xdmac.o obj-$(CONFIG_MMP_PDMA) += mmp_pdma.o diff --git a/drivers/dma/ls2x-apb-dma.c b/drivers/dma/ls2x-apb-dma.c new file mode 100644 index 0000000000000..a49913f3ed3f7 --- /dev/null +++ b/drivers/dma/ls2x-apb-dma.c @@ -0,0 +1,705 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Driver for the Loongson LS2X APB DMA Controller + * + * Copyright (C) 2017-2023 Loongson Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dmaengine.h" +#include "virt-dma.h" + +/* Global Configuration Register */ +#define LDMA_ORDER_ERG 0x0 + +/* Bitfield definitions */ + +/* Bitfields in Global Configuration Register */ +#define LDMA_64BIT_EN BIT(0) /* 1: 64 bit support */ +#define LDMA_UNCOHERENT_EN BIT(1) /* 0: cache, 1: uncache */ +#define LDMA_ASK_VALID BIT(2) +#define LDMA_START BIT(3) /* DMA start operation */ +#define LDMA_STOP BIT(4) /* DMA stop operation */ +#define LDMA_CONFIG_MASK GENMASK(4, 0) /* DMA controller config bits mask */ + +/* Bitfields in ndesc_addr field of HW decriptor */ +#define LDMA_DESC_EN BIT(0) /*1: The next descriptor is valid */ +#define LDMA_DESC_ADDR_LOW GENMASK(31, 1) + +/* Bitfields in cmd field of HW decriptor */ +#define LDMA_INT BIT(1) /* Enable DMA interrupts */ +#define LDMA_DATA_DIRECTION BIT(12) /* 1: write to device, 0: read from device */ + +#define LDMA_SLAVE_BUSWIDTHS (BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) | \ + BIT(DMA_SLAVE_BUSWIDTH_8_BYTES)) + +#define LDMA_MAX_TRANS_LEN U32_MAX + +/*-- descriptors -----------------------------------------------------*/ + +/* + * struct ls2x_dma_hw_desc - DMA HW descriptor + * @ndesc_addr: the next descriptor low address. + * @mem_addr: memory low address. + * @apb_addr: device buffer address. + * @len: length of a piece of carried content, in words. + * @step_len: length between two moved memory data blocks. + * @step_times: number of blocks to be carried in a single DMA operation. + * @cmd: descriptor command or state. + * @stats: DMA status. + * @high_ndesc_addr: the next descriptor high address. + * @high_mem_addr: memory high address. + * @reserved: reserved + */ +struct ls2x_dma_hw_desc { + u32 ndesc_addr; + u32 mem_addr; + u32 apb_addr; + u32 len; + u32 step_len; + u32 step_times; + u32 cmd; + u32 stats; + u32 high_ndesc_addr; + u32 high_mem_addr; + u32 reserved[2]; +} __packed; + +/* + * struct ls2x_dma_sg - ls2x dma scatter gather entry + * @hw: the pointer to DMA HW descriptor. + * @llp: physical address of the DMA HW descriptor. + * @phys: destination or source address(mem). + * @len: number of Bytes to read. + */ +struct ls2x_dma_sg { + struct ls2x_dma_hw_desc *hw; + dma_addr_t llp; + dma_addr_t phys; + u32 len; +}; + +/* + * struct ls2x_dma_desc - software descriptor + * @vdesc: pointer to the virtual dma descriptor. + * @cyclic: flag to dma cyclic + * @burst_size: burst size of transaction, in words. + * @desc_num: number of sg entries. + * @direction: transfer direction, to or from device. + * @status: dma controller status. + * @sg: array of sgs. + */ +struct ls2x_dma_desc { + struct virt_dma_desc vdesc; + bool cyclic; + size_t burst_size; + u32 desc_num; + enum dma_transfer_direction direction; + enum dma_status status; + struct ls2x_dma_sg sg[] __counted_by(desc_num); +}; + +/*-- Channels --------------------------------------------------------*/ + +/* + * struct ls2x_dma_chan - internal representation of an LS2X APB DMA channel + * @vchan: virtual dma channel entry. + * @desc: pointer to the ls2x sw dma descriptor. + * @pool: hw desc table + * @irq: irq line + * @sconfig: configuration for slave transfers, passed via .device_config + */ +struct ls2x_dma_chan { + struct virt_dma_chan vchan; + struct ls2x_dma_desc *desc; + void *pool; + int irq; + struct dma_slave_config sconfig; +}; + +/*-- Controller ------------------------------------------------------*/ + +/* + * struct ls2x_dma_priv - LS2X APB DMAC specific information + * @ddev: dmaengine dma_device object members + * @dma_clk: DMAC clock source + * @regs: memory mapped register base + * @lchan: channel to store ls2x_dma_chan structures + */ +struct ls2x_dma_priv { + struct dma_device ddev; + struct clk *dma_clk; + void __iomem *regs; + struct ls2x_dma_chan lchan; +}; + +/*-- Helper functions ------------------------------------------------*/ + +static inline struct ls2x_dma_desc *to_ldma_desc(struct virt_dma_desc *vdesc) +{ + return container_of(vdesc, struct ls2x_dma_desc, vdesc); +} + +static inline struct ls2x_dma_chan *to_ldma_chan(struct dma_chan *chan) +{ + return container_of(chan, struct ls2x_dma_chan, vchan.chan); +} + +static inline struct ls2x_dma_priv *to_ldma_priv(struct dma_device *ddev) +{ + return container_of(ddev, struct ls2x_dma_priv, ddev); +} + +static struct device *chan2dev(struct dma_chan *chan) +{ + return &chan->dev->device; +} + +static void ls2x_dma_desc_free(struct virt_dma_desc *vdesc) +{ + struct ls2x_dma_chan *lchan = to_ldma_chan(vdesc->tx.chan); + struct ls2x_dma_desc *desc = to_ldma_desc(vdesc); + int i; + + for (i = 0; i < desc->desc_num; i++) { + if (desc->sg[i].hw) + dma_pool_free(lchan->pool, desc->sg[i].hw, + desc->sg[i].llp); + } + + kfree(desc); +} + +static void ls2x_dma_write_cmd(struct ls2x_dma_chan *lchan, bool cmd) +{ + struct ls2x_dma_priv *priv = to_ldma_priv(lchan->vchan.chan.device); + u64 val; + + val = lo_hi_readq(priv->regs + LDMA_ORDER_ERG) & ~LDMA_CONFIG_MASK; + val |= LDMA_64BIT_EN | cmd; + lo_hi_writeq(val, priv->regs + LDMA_ORDER_ERG); +} + +static void ls2x_dma_start_transfer(struct ls2x_dma_chan *lchan) +{ + struct ls2x_dma_priv *priv = to_ldma_priv(lchan->vchan.chan.device); + struct ls2x_dma_sg *ldma_sg; + struct virt_dma_desc *vdesc; + u64 val; + + /* Get the next descriptor */ + vdesc = vchan_next_desc(&lchan->vchan); + if (!vdesc) { + lchan->desc = NULL; + return; + } + + list_del(&vdesc->node); + lchan->desc = to_ldma_desc(vdesc); + ldma_sg = &lchan->desc->sg[0]; + + /* Start DMA */ + lo_hi_writeq(0, priv->regs + LDMA_ORDER_ERG); + val = (ldma_sg->llp & ~LDMA_CONFIG_MASK) | LDMA_64BIT_EN | LDMA_START; + lo_hi_writeq(val, priv->regs + LDMA_ORDER_ERG); +} + +static size_t ls2x_dmac_detect_burst(struct ls2x_dma_chan *lchan) +{ + u32 maxburst, buswidth; + + /* Reject definitely invalid configurations */ + if ((lchan->sconfig.src_addr_width & LDMA_SLAVE_BUSWIDTHS) && + (lchan->sconfig.dst_addr_width & LDMA_SLAVE_BUSWIDTHS)) + return 0; + + if (lchan->sconfig.direction == DMA_MEM_TO_DEV) { + maxburst = lchan->sconfig.dst_maxburst; + buswidth = lchan->sconfig.dst_addr_width; + } else { + maxburst = lchan->sconfig.src_maxburst; + buswidth = lchan->sconfig.src_addr_width; + } + + /* If maxburst is zero, fallback to LDMA_MAX_TRANS_LEN */ + return maxburst ? (maxburst * buswidth) >> 2 : LDMA_MAX_TRANS_LEN; +} + +static void ls2x_dma_fill_desc(struct ls2x_dma_chan *lchan, u32 sg_index, + struct ls2x_dma_desc *desc) +{ + struct ls2x_dma_sg *ldma_sg = &desc->sg[sg_index]; + u32 num_segments, segment_size; + + if (desc->direction == DMA_MEM_TO_DEV) { + ldma_sg->hw->cmd = LDMA_INT | LDMA_DATA_DIRECTION; + ldma_sg->hw->apb_addr = lchan->sconfig.dst_addr; + } else { + ldma_sg->hw->cmd = LDMA_INT; + ldma_sg->hw->apb_addr = lchan->sconfig.src_addr; + } + + ldma_sg->hw->mem_addr = lower_32_bits(ldma_sg->phys); + ldma_sg->hw->high_mem_addr = upper_32_bits(ldma_sg->phys); + + /* Split into multiple equally sized segments if necessary */ + num_segments = DIV_ROUND_UP((ldma_sg->len + 3) >> 2, desc->burst_size); + segment_size = DIV_ROUND_UP((ldma_sg->len + 3) >> 2, num_segments); + + /* Word count register takes input in words */ + ldma_sg->hw->len = segment_size; + ldma_sg->hw->step_times = num_segments; + ldma_sg->hw->step_len = 0; + + /* lets make a link list */ + if (sg_index) { + desc->sg[sg_index - 1].hw->ndesc_addr = ldma_sg->llp | LDMA_DESC_EN; + desc->sg[sg_index - 1].hw->high_ndesc_addr = upper_32_bits(ldma_sg->llp); + } +} + +/*-- DMA Engine API --------------------------------------------------*/ + +/* + * ls2x_dma_alloc_chan_resources - allocate resources for DMA channel + * @chan: allocate descriptor resources for this channel + * + * return - the number of allocated descriptors + */ +static int ls2x_dma_alloc_chan_resources(struct dma_chan *chan) +{ + struct ls2x_dma_chan *lchan = to_ldma_chan(chan); + + /* Create a pool of consistent memory blocks for hardware descriptors */ + lchan->pool = dma_pool_create(dev_name(chan2dev(chan)), + chan->device->dev, PAGE_SIZE, + __alignof__(struct ls2x_dma_hw_desc), 0); + if (!lchan->pool) { + dev_err(chan2dev(chan), "No memory for descriptors\n"); + return -ENOMEM; + } + + return 1; +} + +/* + * ls2x_dma_free_chan_resources - free all channel resources + * @chan: DMA channel + */ +static void ls2x_dma_free_chan_resources(struct dma_chan *chan) +{ + struct ls2x_dma_chan *lchan = to_ldma_chan(chan); + + vchan_free_chan_resources(to_virt_chan(chan)); + dma_pool_destroy(lchan->pool); + lchan->pool = NULL; +} + +/* + * ls2x_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction + * @chan: DMA channel + * @sgl: scatterlist to transfer to/from + * @sg_len: number of entries in @scatterlist + * @direction: DMA direction + * @flags: tx descriptor status flags + * @context: transaction context (ignored) + * + * Return: Async transaction descriptor on success and NULL on failure + */ +static struct dma_async_tx_descriptor * +ls2x_dma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, + u32 sg_len, enum dma_transfer_direction direction, + unsigned long flags, void *context) +{ + struct ls2x_dma_chan *lchan = to_ldma_chan(chan); + struct ls2x_dma_desc *desc; + struct scatterlist *sg; + size_t burst_size; + int i; + + if (unlikely(!sg_len || !is_slave_direction(direction))) + return NULL; + + burst_size = ls2x_dmac_detect_burst(lchan); + if (!burst_size) + return NULL; + + desc = kzalloc(struct_size(desc, sg, sg_len), GFP_NOWAIT); + if (!desc) + return NULL; + + desc->desc_num = sg_len; + desc->direction = direction; + desc->burst_size = burst_size; + + for_each_sg(sgl, sg, sg_len, i) { + struct ls2x_dma_sg *ldma_sg = &desc->sg[i]; + + /* Allocate DMA capable memory for hardware descriptor */ + ldma_sg->hw = dma_pool_alloc(lchan->pool, GFP_NOWAIT, &ldma_sg->llp); + if (!ldma_sg->hw) { + desc->desc_num = i; + ls2x_dma_desc_free(&desc->vdesc); + return NULL; + } + + ldma_sg->phys = sg_dma_address(sg); + ldma_sg->len = sg_dma_len(sg); + + ls2x_dma_fill_desc(lchan, i, desc); + } + + /* Setting the last descriptor enable bit */ + desc->sg[sg_len - 1].hw->ndesc_addr &= ~LDMA_DESC_EN; + desc->status = DMA_IN_PROGRESS; + + return vchan_tx_prep(&lchan->vchan, &desc->vdesc, flags); +} + +/* + * ls2x_dma_prep_dma_cyclic - prepare the cyclic DMA transfer + * @chan: the DMA channel to prepare + * @buf_addr: physical DMA address where the buffer starts + * @buf_len: total number of bytes for the entire buffer + * @period_len: number of bytes for each period + * @direction: transfer direction, to or from device + * @flags: tx descriptor status flags + * + * Return: Async transaction descriptor on success and NULL on failure + */ +static struct dma_async_tx_descriptor * +ls2x_dma_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, + size_t period_len, enum dma_transfer_direction direction, + unsigned long flags) +{ + struct ls2x_dma_chan *lchan = to_ldma_chan(chan); + struct ls2x_dma_desc *desc; + size_t burst_size; + u32 num_periods; + int i; + + if (unlikely(!buf_len || !period_len)) + return NULL; + + if (unlikely(!is_slave_direction(direction))) + return NULL; + + burst_size = ls2x_dmac_detect_burst(lchan); + if (!burst_size) + return NULL; + + num_periods = buf_len / period_len; + desc = kzalloc(struct_size(desc, sg, num_periods), GFP_NOWAIT); + if (!desc) + return NULL; + + desc->desc_num = num_periods; + desc->direction = direction; + desc->burst_size = burst_size; + + /* Build cyclic linked list */ + for (i = 0; i < num_periods; i++) { + struct ls2x_dma_sg *ldma_sg = &desc->sg[i]; + + /* Allocate DMA capable memory for hardware descriptor */ + ldma_sg->hw = dma_pool_alloc(lchan->pool, GFP_NOWAIT, &ldma_sg->llp); + if (!ldma_sg->hw) { + desc->desc_num = i; + ls2x_dma_desc_free(&desc->vdesc); + return NULL; + } + + ldma_sg->phys = buf_addr + period_len * i; + ldma_sg->len = period_len; + + ls2x_dma_fill_desc(lchan, i, desc); + } + + /* Lets make a cyclic list */ + desc->sg[num_periods - 1].hw->ndesc_addr = desc->sg[0].llp | LDMA_DESC_EN; + desc->sg[num_periods - 1].hw->high_ndesc_addr = upper_32_bits(desc->sg[0].llp); + desc->cyclic = true; + desc->status = DMA_IN_PROGRESS; + + return vchan_tx_prep(&lchan->vchan, &desc->vdesc, flags); +} + +/* + * ls2x_slave_config - set slave configuration for channel + * @chan: dma channel + * @cfg: slave configuration + * + * Sets slave configuration for channel + */ +static int ls2x_dma_slave_config(struct dma_chan *chan, + struct dma_slave_config *config) +{ + struct ls2x_dma_chan *lchan = to_ldma_chan(chan); + + memcpy(&lchan->sconfig, config, sizeof(*config)); + return 0; +} + +/* + * ls2x_dma_issue_pending - push pending transactions to the hardware + * @chan: channel + * + * When this function is called, all pending transactions are pushed to the + * hardware and executed. + */ +static void ls2x_dma_issue_pending(struct dma_chan *chan) +{ + struct ls2x_dma_chan *lchan = to_ldma_chan(chan); + unsigned long flags; + + spin_lock_irqsave(&lchan->vchan.lock, flags); + if (vchan_issue_pending(&lchan->vchan) && !lchan->desc) + ls2x_dma_start_transfer(lchan); + spin_unlock_irqrestore(&lchan->vchan.lock, flags); +} + +/* + * ls2x_dma_terminate_all - terminate all transactions + * @chan: channel + * + * Stops all DMA transactions. + */ +static int ls2x_dma_terminate_all(struct dma_chan *chan) +{ + struct ls2x_dma_chan *lchan = to_ldma_chan(chan); + unsigned long flags; + LIST_HEAD(head); + + spin_lock_irqsave(&lchan->vchan.lock, flags); + /* Setting stop cmd */ + ls2x_dma_write_cmd(lchan, LDMA_STOP); + if (lchan->desc) { + vchan_terminate_vdesc(&lchan->desc->vdesc); + lchan->desc = NULL; + } + + vchan_get_all_descriptors(&lchan->vchan, &head); + spin_unlock_irqrestore(&lchan->vchan.lock, flags); + + vchan_dma_desc_free_list(&lchan->vchan, &head); + return 0; +} + +/* + * ls2x_dma_synchronize - Synchronizes the termination of transfers to the + * current context. + * @chan: channel + */ +static void ls2x_dma_synchronize(struct dma_chan *chan) +{ + struct ls2x_dma_chan *lchan = to_ldma_chan(chan); + + vchan_synchronize(&lchan->vchan); +} + +static int ls2x_dma_pause(struct dma_chan *chan) +{ + struct ls2x_dma_chan *lchan = to_ldma_chan(chan); + unsigned long flags; + + spin_lock_irqsave(&lchan->vchan.lock, flags); + if (lchan->desc && lchan->desc->status == DMA_IN_PROGRESS) { + ls2x_dma_write_cmd(lchan, LDMA_STOP); + lchan->desc->status = DMA_PAUSED; + } + spin_unlock_irqrestore(&lchan->vchan.lock, flags); + + return 0; +} + +static int ls2x_dma_resume(struct dma_chan *chan) +{ + struct ls2x_dma_chan *lchan = to_ldma_chan(chan); + unsigned long flags; + + spin_lock_irqsave(&lchan->vchan.lock, flags); + if (lchan->desc && lchan->desc->status == DMA_PAUSED) { + lchan->desc->status = DMA_IN_PROGRESS; + ls2x_dma_write_cmd(lchan, LDMA_START); + } + spin_unlock_irqrestore(&lchan->vchan.lock, flags); + + return 0; +} + +/* + * ls2x_dma_isr - LS2X DMA Interrupt handler + * @irq: IRQ number + * @dev_id: Pointer to ls2x_dma_chan + * + * Return: IRQ_HANDLED/IRQ_NONE + */ +static irqreturn_t ls2x_dma_isr(int irq, void *dev_id) +{ + struct ls2x_dma_chan *lchan = dev_id; + struct ls2x_dma_desc *desc; + + spin_lock(&lchan->vchan.lock); + desc = lchan->desc; + if (desc) { + if (desc->cyclic) { + vchan_cyclic_callback(&desc->vdesc); + } else { + desc->status = DMA_COMPLETE; + vchan_cookie_complete(&desc->vdesc); + ls2x_dma_start_transfer(lchan); + } + + /* ls2x_dma_start_transfer() updates lchan->desc */ + if (!lchan->desc) + ls2x_dma_write_cmd(lchan, LDMA_STOP); + } + spin_unlock(&lchan->vchan.lock); + + return IRQ_HANDLED; +} + +static int ls2x_dma_chan_init(struct platform_device *pdev, + struct ls2x_dma_priv *priv) +{ + struct ls2x_dma_chan *lchan = &priv->lchan; + struct device *dev = &pdev->dev; + int ret; + + lchan->irq = platform_get_irq(pdev, 0); + if (lchan->irq < 0) + return lchan->irq; + + ret = devm_request_irq(dev, lchan->irq, ls2x_dma_isr, IRQF_TRIGGER_RISING, + dev_name(&pdev->dev), lchan); + if (ret) + return ret; + + /* Initialize channels related values */ + INIT_LIST_HEAD(&priv->ddev.channels); + lchan->vchan.desc_free = ls2x_dma_desc_free; + vchan_init(&lchan->vchan, &priv->ddev); + + return 0; +} + +/* + * ls2x_dma_probe - Driver probe function + * @pdev: Pointer to the platform_device structure + * + * Return: '0' on success and failure value on error + */ +static int ls2x_dma_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct ls2x_dma_priv *priv; + struct dma_device *ddev; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->regs = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->regs)) + return dev_err_probe(dev, PTR_ERR(priv->regs), + "devm_platform_ioremap_resource failed.\n"); + + priv->dma_clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(priv->dma_clk)) + return dev_err_probe(dev, PTR_ERR(priv->dma_clk), "devm_clk_get failed.\n"); + + ret = clk_prepare_enable(priv->dma_clk); + if (ret) + return dev_err_probe(dev, ret, "clk_prepare_enable failed.\n"); + + ret = ls2x_dma_chan_init(pdev, priv); + if (ret) + goto disable_clk; + + ddev = &priv->ddev; + ddev->dev = dev; + dma_cap_zero(ddev->cap_mask); + dma_cap_set(DMA_SLAVE, ddev->cap_mask); + dma_cap_set(DMA_CYCLIC, ddev->cap_mask); + + ddev->device_alloc_chan_resources = ls2x_dma_alloc_chan_resources; + ddev->device_free_chan_resources = ls2x_dma_free_chan_resources; + ddev->device_tx_status = dma_cookie_status; + ddev->device_issue_pending = ls2x_dma_issue_pending; + ddev->device_prep_slave_sg = ls2x_dma_prep_slave_sg; + ddev->device_prep_dma_cyclic = ls2x_dma_prep_dma_cyclic; + ddev->device_config = ls2x_dma_slave_config; + ddev->device_terminate_all = ls2x_dma_terminate_all; + ddev->device_synchronize = ls2x_dma_synchronize; + ddev->device_pause = ls2x_dma_pause; + ddev->device_resume = ls2x_dma_resume; + + ddev->src_addr_widths = LDMA_SLAVE_BUSWIDTHS; + ddev->dst_addr_widths = LDMA_SLAVE_BUSWIDTHS; + ddev->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); + + ret = dma_async_device_register(&priv->ddev); + if (ret < 0) + goto disable_clk; + + ret = of_dma_controller_register(dev->of_node, of_dma_xlate_by_chan_id, priv); + if (ret < 0) + goto unregister_dmac; + + platform_set_drvdata(pdev, priv); + + dev_info(dev, "Loongson LS2X APB DMA driver registered successfully.\n"); + return 0; + +unregister_dmac: + dma_async_device_unregister(&priv->ddev); +disable_clk: + clk_disable_unprepare(priv->dma_clk); + + return ret; +} + +/* + * ls2x_dma_remove - Driver remove function + * @pdev: Pointer to the platform_device structure + */ +static void ls2x_dma_remove(struct platform_device *pdev) +{ + struct ls2x_dma_priv *priv = platform_get_drvdata(pdev); + + of_dma_controller_free(pdev->dev.of_node); + dma_async_device_unregister(&priv->ddev); + clk_disable_unprepare(priv->dma_clk); +} + +static const struct of_device_id ls2x_dma_of_match_table[] = { + { .compatible = "loongson,ls2k1000-apbdma" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, ls2x_dma_of_match_table); + +static struct platform_driver ls2x_dmac_driver = { + .probe = ls2x_dma_probe, + .remove_new = ls2x_dma_remove, + .driver = { + .name = "ls2x-apbdma", + .of_match_table = ls2x_dma_of_match_table, + }, +}; +module_platform_driver(ls2x_dmac_driver); + +MODULE_DESCRIPTION("Loongson LS2X APB DMA Controller driver"); +MODULE_AUTHOR("Loongson Technology Corporation Limited"); +MODULE_LICENSE("GPL"); -- GitLab From a2ab7045389feab1c26ebab105a8ad6bce74a4a7 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 15 Dec 2023 14:13:09 +0100 Subject: [PATCH 0444/1290] dmaengine: axi-dmac: Small code cleanup Use a for() loop instead of a while() loop in axi_dmac_fill_linear_sg(). This makes the code leaner and cleaner overall, and does not introduce any functional change. Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20231215131313.23840-2-paul@crapouillou.net Signed-off-by: Vinod Koul --- drivers/dma/dma-axi-dmac.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/dma/dma-axi-dmac.c b/drivers/dma/dma-axi-dmac.c index 2457a420c13d7..760940b21eabf 100644 --- a/drivers/dma/dma-axi-dmac.c +++ b/drivers/dma/dma-axi-dmac.c @@ -508,16 +508,13 @@ static struct axi_dmac_sg *axi_dmac_fill_linear_sg(struct axi_dmac_chan *chan, segment_size = ((segment_size - 1) | chan->length_align_mask) + 1; for (i = 0; i < num_periods; i++) { - len = period_len; - - while (len > segment_size) { + for (len = period_len; len > segment_size; sg++) { if (direction == DMA_DEV_TO_MEM) sg->dest_addr = addr; else sg->src_addr = addr; sg->x_len = segment_size; sg->y_len = 1; - sg++; addr += segment_size; len -= segment_size; } -- GitLab From 3f8fd25936ee5f52596f10d420f650c5b5e3285f Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 15 Dec 2023 14:13:10 +0100 Subject: [PATCH 0445/1290] dmaengine: axi-dmac: Allocate hardware descriptors Change where and how the DMA transfers meta-data is stored, to prepare for the upcoming introduction of scatter-gather support. Allocate hardware descriptors in the format that the HDL core will be expecting them when the scatter-gather feature is enabled, and use these fields to store the data that was previously stored in the axi_dmac_sg structure. Note that the 'x_len' and 'y_len' fields now contain the transfer length minus one, since that's what the hardware will expect in these fields. Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20231215131313.23840-3-paul@crapouillou.net Signed-off-by: Vinod Koul --- drivers/dma/dma-axi-dmac.c | 134 ++++++++++++++++++++++++------------- 1 file changed, 88 insertions(+), 46 deletions(-) diff --git a/drivers/dma/dma-axi-dmac.c b/drivers/dma/dma-axi-dmac.c index 760940b21eabf..185230a769b93 100644 --- a/drivers/dma/dma-axi-dmac.c +++ b/drivers/dma/dma-axi-dmac.c @@ -97,20 +97,31 @@ /* The maximum ID allocated by the hardware is 31 */ #define AXI_DMAC_SG_UNUSED 32U +struct axi_dmac_hw_desc { + u32 flags; + u32 id; + u64 dest_addr; + u64 src_addr; + u64 __unused; + u32 y_len; + u32 x_len; + u32 src_stride; + u32 dst_stride; + u64 __pad[2]; +}; + struct axi_dmac_sg { - dma_addr_t src_addr; - dma_addr_t dest_addr; - unsigned int x_len; - unsigned int y_len; - unsigned int dest_stride; - unsigned int src_stride; - unsigned int id; unsigned int partial_len; bool schedule_when_free; + + struct axi_dmac_hw_desc *hw; + dma_addr_t hw_phys; }; struct axi_dmac_desc { struct virt_dma_desc vdesc; + struct axi_dmac_chan *chan; + bool cyclic; bool have_partial_xfer; @@ -229,7 +240,7 @@ static void axi_dmac_start_transfer(struct axi_dmac_chan *chan) sg = &desc->sg[desc->num_submitted]; /* Already queued in cyclic mode. Wait for it to finish */ - if (sg->id != AXI_DMAC_SG_UNUSED) { + if (sg->hw->id != AXI_DMAC_SG_UNUSED) { sg->schedule_when_free = true; return; } @@ -246,16 +257,16 @@ static void axi_dmac_start_transfer(struct axi_dmac_chan *chan) chan->next_desc = desc; } - sg->id = axi_dmac_read(dmac, AXI_DMAC_REG_TRANSFER_ID); + sg->hw->id = axi_dmac_read(dmac, AXI_DMAC_REG_TRANSFER_ID); if (axi_dmac_dest_is_mem(chan)) { - axi_dmac_write(dmac, AXI_DMAC_REG_DEST_ADDRESS, sg->dest_addr); - axi_dmac_write(dmac, AXI_DMAC_REG_DEST_STRIDE, sg->dest_stride); + axi_dmac_write(dmac, AXI_DMAC_REG_DEST_ADDRESS, sg->hw->dest_addr); + axi_dmac_write(dmac, AXI_DMAC_REG_DEST_STRIDE, sg->hw->dst_stride); } if (axi_dmac_src_is_mem(chan)) { - axi_dmac_write(dmac, AXI_DMAC_REG_SRC_ADDRESS, sg->src_addr); - axi_dmac_write(dmac, AXI_DMAC_REG_SRC_STRIDE, sg->src_stride); + axi_dmac_write(dmac, AXI_DMAC_REG_SRC_ADDRESS, sg->hw->src_addr); + axi_dmac_write(dmac, AXI_DMAC_REG_SRC_STRIDE, sg->hw->src_stride); } /* @@ -270,8 +281,8 @@ static void axi_dmac_start_transfer(struct axi_dmac_chan *chan) if (chan->hw_partial_xfer) flags |= AXI_DMAC_FLAG_PARTIAL_REPORT; - axi_dmac_write(dmac, AXI_DMAC_REG_X_LENGTH, sg->x_len - 1); - axi_dmac_write(dmac, AXI_DMAC_REG_Y_LENGTH, sg->y_len - 1); + axi_dmac_write(dmac, AXI_DMAC_REG_X_LENGTH, sg->hw->x_len); + axi_dmac_write(dmac, AXI_DMAC_REG_Y_LENGTH, sg->hw->y_len); axi_dmac_write(dmac, AXI_DMAC_REG_FLAGS, flags); axi_dmac_write(dmac, AXI_DMAC_REG_START_TRANSFER, 1); } @@ -286,9 +297,9 @@ static inline unsigned int axi_dmac_total_sg_bytes(struct axi_dmac_chan *chan, struct axi_dmac_sg *sg) { if (chan->hw_2d) - return sg->x_len * sg->y_len; + return (sg->hw->x_len + 1) * (sg->hw->y_len + 1); else - return sg->x_len; + return (sg->hw->x_len + 1); } static void axi_dmac_dequeue_partial_xfers(struct axi_dmac_chan *chan) @@ -307,9 +318,9 @@ static void axi_dmac_dequeue_partial_xfers(struct axi_dmac_chan *chan) list_for_each_entry(desc, &chan->active_descs, vdesc.node) { for (i = 0; i < desc->num_sgs; i++) { sg = &desc->sg[i]; - if (sg->id == AXI_DMAC_SG_UNUSED) + if (sg->hw->id == AXI_DMAC_SG_UNUSED) continue; - if (sg->id == id) { + if (sg->hw->id == id) { desc->have_partial_xfer = true; sg->partial_len = len; found_sg = true; @@ -376,12 +387,12 @@ static bool axi_dmac_transfer_done(struct axi_dmac_chan *chan, do { sg = &active->sg[active->num_completed]; - if (sg->id == AXI_DMAC_SG_UNUSED) /* Not yet submitted */ + if (sg->hw->id == AXI_DMAC_SG_UNUSED) /* Not yet submitted */ break; - if (!(BIT(sg->id) & completed_transfers)) + if (!(BIT(sg->hw->id) & completed_transfers)) break; active->num_completed++; - sg->id = AXI_DMAC_SG_UNUSED; + sg->hw->id = AXI_DMAC_SG_UNUSED; if (sg->schedule_when_free) { sg->schedule_when_free = false; start_next = true; @@ -476,22 +487,52 @@ static void axi_dmac_issue_pending(struct dma_chan *c) spin_unlock_irqrestore(&chan->vchan.lock, flags); } -static struct axi_dmac_desc *axi_dmac_alloc_desc(unsigned int num_sgs) +static struct axi_dmac_desc * +axi_dmac_alloc_desc(struct axi_dmac_chan *chan, unsigned int num_sgs) { + struct axi_dmac *dmac = chan_to_axi_dmac(chan); + struct device *dev = dmac->dma_dev.dev; + struct axi_dmac_hw_desc *hws; struct axi_dmac_desc *desc; + dma_addr_t hw_phys; unsigned int i; desc = kzalloc(struct_size(desc, sg, num_sgs), GFP_NOWAIT); if (!desc) return NULL; desc->num_sgs = num_sgs; + desc->chan = chan; - for (i = 0; i < num_sgs; i++) - desc->sg[i].id = AXI_DMAC_SG_UNUSED; + hws = dma_alloc_coherent(dev, PAGE_ALIGN(num_sgs * sizeof(*hws)), + &hw_phys, GFP_ATOMIC); + if (!hws) { + kfree(desc); + return NULL; + } + + for (i = 0; i < num_sgs; i++) { + desc->sg[i].hw = &hws[i]; + desc->sg[i].hw_phys = hw_phys + i * sizeof(*hws); + + hws[i].id = AXI_DMAC_SG_UNUSED; + hws[i].flags = 0; + } return desc; } +static void axi_dmac_free_desc(struct axi_dmac_desc *desc) +{ + struct axi_dmac *dmac = chan_to_axi_dmac(desc->chan); + struct device *dev = dmac->dma_dev.dev; + struct axi_dmac_hw_desc *hw = desc->sg[0].hw; + dma_addr_t hw_phys = desc->sg[0].hw_phys; + + dma_free_coherent(dev, PAGE_ALIGN(desc->num_sgs * sizeof(*hw)), + hw, hw_phys); + kfree(desc); +} + static struct axi_dmac_sg *axi_dmac_fill_linear_sg(struct axi_dmac_chan *chan, enum dma_transfer_direction direction, dma_addr_t addr, unsigned int num_periods, unsigned int period_len, @@ -510,21 +551,22 @@ static struct axi_dmac_sg *axi_dmac_fill_linear_sg(struct axi_dmac_chan *chan, for (i = 0; i < num_periods; i++) { for (len = period_len; len > segment_size; sg++) { if (direction == DMA_DEV_TO_MEM) - sg->dest_addr = addr; + sg->hw->dest_addr = addr; else - sg->src_addr = addr; - sg->x_len = segment_size; - sg->y_len = 1; + sg->hw->src_addr = addr; + sg->hw->x_len = segment_size - 1; + sg->hw->y_len = 0; + sg->hw->flags = 0; addr += segment_size; len -= segment_size; } if (direction == DMA_DEV_TO_MEM) - sg->dest_addr = addr; + sg->hw->dest_addr = addr; else - sg->src_addr = addr; - sg->x_len = len; - sg->y_len = 1; + sg->hw->src_addr = addr; + sg->hw->x_len = len - 1; + sg->hw->y_len = 0; sg++; addr += len; } @@ -551,7 +593,7 @@ static struct dma_async_tx_descriptor *axi_dmac_prep_slave_sg( for_each_sg(sgl, sg, sg_len, i) num_sgs += DIV_ROUND_UP(sg_dma_len(sg), chan->max_length); - desc = axi_dmac_alloc_desc(num_sgs); + desc = axi_dmac_alloc_desc(chan, num_sgs); if (!desc) return NULL; @@ -560,7 +602,7 @@ static struct dma_async_tx_descriptor *axi_dmac_prep_slave_sg( for_each_sg(sgl, sg, sg_len, i) { if (!axi_dmac_check_addr(chan, sg_dma_address(sg)) || !axi_dmac_check_len(chan, sg_dma_len(sg))) { - kfree(desc); + axi_dmac_free_desc(desc); return NULL; } @@ -595,7 +637,7 @@ static struct dma_async_tx_descriptor *axi_dmac_prep_dma_cyclic( num_periods = buf_len / period_len; num_segments = DIV_ROUND_UP(period_len, chan->max_length); - desc = axi_dmac_alloc_desc(num_periods * num_segments); + desc = axi_dmac_alloc_desc(chan, num_periods * num_segments); if (!desc) return NULL; @@ -650,26 +692,26 @@ static struct dma_async_tx_descriptor *axi_dmac_prep_interleaved( return NULL; } - desc = axi_dmac_alloc_desc(1); + desc = axi_dmac_alloc_desc(chan, 1); if (!desc) return NULL; if (axi_dmac_src_is_mem(chan)) { - desc->sg[0].src_addr = xt->src_start; - desc->sg[0].src_stride = xt->sgl[0].size + src_icg; + desc->sg[0].hw->src_addr = xt->src_start; + desc->sg[0].hw->src_stride = xt->sgl[0].size + src_icg; } if (axi_dmac_dest_is_mem(chan)) { - desc->sg[0].dest_addr = xt->dst_start; - desc->sg[0].dest_stride = xt->sgl[0].size + dst_icg; + desc->sg[0].hw->dest_addr = xt->dst_start; + desc->sg[0].hw->dst_stride = xt->sgl[0].size + dst_icg; } if (chan->hw_2d) { - desc->sg[0].x_len = xt->sgl[0].size; - desc->sg[0].y_len = xt->numf; + desc->sg[0].hw->x_len = xt->sgl[0].size - 1; + desc->sg[0].hw->y_len = xt->numf - 1; } else { - desc->sg[0].x_len = xt->sgl[0].size * xt->numf; - desc->sg[0].y_len = 1; + desc->sg[0].hw->x_len = xt->sgl[0].size * xt->numf - 1; + desc->sg[0].hw->y_len = 0; } if (flags & DMA_CYCLIC) @@ -685,7 +727,7 @@ static void axi_dmac_free_chan_resources(struct dma_chan *c) static void axi_dmac_desc_free(struct virt_dma_desc *vdesc) { - kfree(container_of(vdesc, struct axi_dmac_desc, vdesc)); + axi_dmac_free_desc(to_axi_dmac_desc(vdesc)); } static bool axi_dmac_regmap_rdwr(struct device *dev, unsigned int reg) -- GitLab From e97dc7435972d28ac7d96d199d4aedb868d04fd8 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 15 Dec 2023 14:13:11 +0100 Subject: [PATCH 0446/1290] dmaengine: axi-dmac: Add support for scatter-gather transfers Implement support for scatter-gather transfers. Build a chain of hardware descriptors, each one corresponding to a segment of the transfer, and linked to the next one. The hardware will transfer the chain and only fire interrupts when the whole chain has been transferred. Support for scatter-gather is automatically enabled when the driver detects that the hardware supports it, by writing then reading the AXI_DMAC_REG_SG_ADDRESS register. If not available, the driver will fall back to standard DMA transfers. Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20231215131313.23840-4-paul@crapouillou.net Signed-off-by: Vinod Koul --- drivers/dma/dma-axi-dmac.c | 135 +++++++++++++++++++++++++------------ 1 file changed, 93 insertions(+), 42 deletions(-) diff --git a/drivers/dma/dma-axi-dmac.c b/drivers/dma/dma-axi-dmac.c index 185230a769b93..5109530b66de9 100644 --- a/drivers/dma/dma-axi-dmac.c +++ b/drivers/dma/dma-axi-dmac.c @@ -81,9 +81,13 @@ #define AXI_DMAC_REG_CURRENT_DEST_ADDR 0x438 #define AXI_DMAC_REG_PARTIAL_XFER_LEN 0x44c #define AXI_DMAC_REG_PARTIAL_XFER_ID 0x450 +#define AXI_DMAC_REG_CURRENT_SG_ID 0x454 +#define AXI_DMAC_REG_SG_ADDRESS 0x47c +#define AXI_DMAC_REG_SG_ADDRESS_HIGH 0x4bc #define AXI_DMAC_CTRL_ENABLE BIT(0) #define AXI_DMAC_CTRL_PAUSE BIT(1) +#define AXI_DMAC_CTRL_ENABLE_SG BIT(2) #define AXI_DMAC_IRQ_SOT BIT(0) #define AXI_DMAC_IRQ_EOT BIT(1) @@ -97,12 +101,16 @@ /* The maximum ID allocated by the hardware is 31 */ #define AXI_DMAC_SG_UNUSED 32U +/* Flags for axi_dmac_hw_desc.flags */ +#define AXI_DMAC_HW_FLAG_LAST BIT(0) +#define AXI_DMAC_HW_FLAG_IRQ BIT(1) + struct axi_dmac_hw_desc { u32 flags; u32 id; u64 dest_addr; u64 src_addr; - u64 __unused; + u64 next_sg_addr; u32 y_len; u32 x_len; u32 src_stride; @@ -150,6 +158,7 @@ struct axi_dmac_chan { bool hw_partial_xfer; bool hw_cyclic; bool hw_2d; + bool hw_sg; }; struct axi_dmac { @@ -224,9 +233,11 @@ static void axi_dmac_start_transfer(struct axi_dmac_chan *chan) unsigned int flags = 0; unsigned int val; - val = axi_dmac_read(dmac, AXI_DMAC_REG_START_TRANSFER); - if (val) /* Queue is full, wait for the next SOT IRQ */ - return; + if (!chan->hw_sg) { + val = axi_dmac_read(dmac, AXI_DMAC_REG_START_TRANSFER); + if (val) /* Queue is full, wait for the next SOT IRQ */ + return; + } desc = chan->next_desc; @@ -245,9 +256,10 @@ static void axi_dmac_start_transfer(struct axi_dmac_chan *chan) return; } - desc->num_submitted++; - if (desc->num_submitted == desc->num_sgs || - desc->have_partial_xfer) { + if (chan->hw_sg) { + chan->next_desc = NULL; + } else if (++desc->num_submitted == desc->num_sgs || + desc->have_partial_xfer) { if (desc->cyclic) desc->num_submitted = 0; /* Start again */ else @@ -259,14 +271,16 @@ static void axi_dmac_start_transfer(struct axi_dmac_chan *chan) sg->hw->id = axi_dmac_read(dmac, AXI_DMAC_REG_TRANSFER_ID); - if (axi_dmac_dest_is_mem(chan)) { - axi_dmac_write(dmac, AXI_DMAC_REG_DEST_ADDRESS, sg->hw->dest_addr); - axi_dmac_write(dmac, AXI_DMAC_REG_DEST_STRIDE, sg->hw->dst_stride); - } + if (!chan->hw_sg) { + if (axi_dmac_dest_is_mem(chan)) { + axi_dmac_write(dmac, AXI_DMAC_REG_DEST_ADDRESS, sg->hw->dest_addr); + axi_dmac_write(dmac, AXI_DMAC_REG_DEST_STRIDE, sg->hw->dst_stride); + } - if (axi_dmac_src_is_mem(chan)) { - axi_dmac_write(dmac, AXI_DMAC_REG_SRC_ADDRESS, sg->hw->src_addr); - axi_dmac_write(dmac, AXI_DMAC_REG_SRC_STRIDE, sg->hw->src_stride); + if (axi_dmac_src_is_mem(chan)) { + axi_dmac_write(dmac, AXI_DMAC_REG_SRC_ADDRESS, sg->hw->src_addr); + axi_dmac_write(dmac, AXI_DMAC_REG_SRC_STRIDE, sg->hw->src_stride); + } } /* @@ -281,8 +295,14 @@ static void axi_dmac_start_transfer(struct axi_dmac_chan *chan) if (chan->hw_partial_xfer) flags |= AXI_DMAC_FLAG_PARTIAL_REPORT; - axi_dmac_write(dmac, AXI_DMAC_REG_X_LENGTH, sg->hw->x_len); - axi_dmac_write(dmac, AXI_DMAC_REG_Y_LENGTH, sg->hw->y_len); + if (chan->hw_sg) { + axi_dmac_write(dmac, AXI_DMAC_REG_SG_ADDRESS, (u32)sg->hw_phys); + axi_dmac_write(dmac, AXI_DMAC_REG_SG_ADDRESS_HIGH, + (u64)sg->hw_phys >> 32); + } else { + axi_dmac_write(dmac, AXI_DMAC_REG_X_LENGTH, sg->hw->x_len); + axi_dmac_write(dmac, AXI_DMAC_REG_Y_LENGTH, sg->hw->y_len); + } axi_dmac_write(dmac, AXI_DMAC_REG_FLAGS, flags); axi_dmac_write(dmac, AXI_DMAC_REG_START_TRANSFER, 1); } @@ -359,6 +379,9 @@ static void axi_dmac_compute_residue(struct axi_dmac_chan *chan, rslt->result = DMA_TRANS_NOERROR; rslt->residue = 0; + if (chan->hw_sg) + return; + /* * We get here if the last completed segment is partial, which * means we can compute the residue from that segment onwards @@ -385,36 +408,46 @@ static bool axi_dmac_transfer_done(struct axi_dmac_chan *chan, (completed_transfers & AXI_DMAC_FLAG_PARTIAL_XFER_DONE)) axi_dmac_dequeue_partial_xfers(chan); - do { - sg = &active->sg[active->num_completed]; - if (sg->hw->id == AXI_DMAC_SG_UNUSED) /* Not yet submitted */ - break; - if (!(BIT(sg->hw->id) & completed_transfers)) - break; - active->num_completed++; - sg->hw->id = AXI_DMAC_SG_UNUSED; - if (sg->schedule_when_free) { - sg->schedule_when_free = false; - start_next = true; + if (chan->hw_sg) { + if (active->cyclic) { + vchan_cyclic_callback(&active->vdesc); + } else { + list_del(&active->vdesc.node); + vchan_cookie_complete(&active->vdesc); + active = axi_dmac_active_desc(chan); } + } else { + do { + sg = &active->sg[active->num_completed]; + if (sg->hw->id == AXI_DMAC_SG_UNUSED) /* Not yet submitted */ + break; + if (!(BIT(sg->hw->id) & completed_transfers)) + break; + active->num_completed++; + sg->hw->id = AXI_DMAC_SG_UNUSED; + if (sg->schedule_when_free) { + sg->schedule_when_free = false; + start_next = true; + } - if (sg->partial_len) - axi_dmac_compute_residue(chan, active); + if (sg->partial_len) + axi_dmac_compute_residue(chan, active); - if (active->cyclic) - vchan_cyclic_callback(&active->vdesc); + if (active->cyclic) + vchan_cyclic_callback(&active->vdesc); - if (active->num_completed == active->num_sgs || - sg->partial_len) { - if (active->cyclic) { - active->num_completed = 0; /* wrap around */ - } else { - list_del(&active->vdesc.node); - vchan_cookie_complete(&active->vdesc); - active = axi_dmac_active_desc(chan); + if (active->num_completed == active->num_sgs || + sg->partial_len) { + if (active->cyclic) { + active->num_completed = 0; /* wrap around */ + } else { + list_del(&active->vdesc.node); + vchan_cookie_complete(&active->vdesc); + active = axi_dmac_active_desc(chan); + } } - } - } while (active); + } while (active); + } return start_next; } @@ -478,8 +511,12 @@ static void axi_dmac_issue_pending(struct dma_chan *c) struct axi_dmac_chan *chan = to_axi_dmac_chan(c); struct axi_dmac *dmac = chan_to_axi_dmac(chan); unsigned long flags; + u32 ctrl = AXI_DMAC_CTRL_ENABLE; + + if (chan->hw_sg) + ctrl |= AXI_DMAC_CTRL_ENABLE_SG; - axi_dmac_write(dmac, AXI_DMAC_REG_CTRL, AXI_DMAC_CTRL_ENABLE); + axi_dmac_write(dmac, AXI_DMAC_REG_CTRL, ctrl); spin_lock_irqsave(&chan->vchan.lock, flags); if (vchan_issue_pending(&chan->vchan)) @@ -516,8 +553,14 @@ axi_dmac_alloc_desc(struct axi_dmac_chan *chan, unsigned int num_sgs) hws[i].id = AXI_DMAC_SG_UNUSED; hws[i].flags = 0; + + /* Link hardware descriptors */ + hws[i].next_sg_addr = hw_phys + (i + 1) * sizeof(*hws); } + /* The last hardware descriptor will trigger an interrupt */ + desc->sg[num_sgs - 1].hw->flags = AXI_DMAC_HW_FLAG_LAST | AXI_DMAC_HW_FLAG_IRQ; + return desc; } @@ -753,6 +796,9 @@ static bool axi_dmac_regmap_rdwr(struct device *dev, unsigned int reg) case AXI_DMAC_REG_CURRENT_DEST_ADDR: case AXI_DMAC_REG_PARTIAL_XFER_LEN: case AXI_DMAC_REG_PARTIAL_XFER_ID: + case AXI_DMAC_REG_CURRENT_SG_ID: + case AXI_DMAC_REG_SG_ADDRESS: + case AXI_DMAC_REG_SG_ADDRESS_HIGH: return true; default: return false; @@ -905,6 +951,10 @@ static int axi_dmac_detect_caps(struct axi_dmac *dmac, unsigned int version) if (axi_dmac_read(dmac, AXI_DMAC_REG_FLAGS) == AXI_DMAC_FLAG_CYCLIC) chan->hw_cyclic = true; + axi_dmac_write(dmac, AXI_DMAC_REG_SG_ADDRESS, 0xffffffff); + if (axi_dmac_read(dmac, AXI_DMAC_REG_SG_ADDRESS)) + chan->hw_sg = true; + axi_dmac_write(dmac, AXI_DMAC_REG_Y_LENGTH, 1); if (axi_dmac_read(dmac, AXI_DMAC_REG_Y_LENGTH) == 1) chan->hw_2d = true; @@ -1005,6 +1055,7 @@ static int axi_dmac_probe(struct platform_device *pdev) dma_dev->dst_addr_widths = BIT(dmac->chan.dest_width); dma_dev->directions = BIT(dmac->chan.direction); dma_dev->residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR; + dma_dev->max_sg_burst = 31; /* 31 SGs maximum in one burst */ INIT_LIST_HEAD(&dma_dev->channels); dmac->chan.vchan.desc_free = axi_dmac_desc_free; -- GitLab From 238f68a08e19a612b8912c8697901e9982f97811 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 15 Dec 2023 14:13:12 +0100 Subject: [PATCH 0447/1290] dmaengine: axi-dmac: Use only EOT interrupts when doing scatter-gather Instead of notifying userspace in the end-of-transfer (EOT) interrupt and program the hardware in the start-of-transfer (SOT) interrupt, we can do both things in the EOT, allowing us to mask the SOT, and halve the number of interrupts sent by the HDL core. Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20231215131313.23840-5-paul@crapouillou.net Signed-off-by: Vinod Koul --- drivers/dma/dma-axi-dmac.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/dma/dma-axi-dmac.c b/drivers/dma/dma-axi-dmac.c index 5109530b66de9..f63acae511fb2 100644 --- a/drivers/dma/dma-axi-dmac.c +++ b/drivers/dma/dma-axi-dmac.c @@ -411,10 +411,12 @@ static bool axi_dmac_transfer_done(struct axi_dmac_chan *chan, if (chan->hw_sg) { if (active->cyclic) { vchan_cyclic_callback(&active->vdesc); + start_next = true; } else { list_del(&active->vdesc.node); vchan_cookie_complete(&active->vdesc); active = axi_dmac_active_desc(chan); + start_next = !!active; } } else { do { @@ -1000,6 +1002,7 @@ static int axi_dmac_probe(struct platform_device *pdev) struct axi_dmac *dmac; struct regmap *regmap; unsigned int version; + u32 irq_mask = 0; int ret; dmac = devm_kzalloc(&pdev->dev, sizeof(*dmac), GFP_KERNEL); @@ -1067,7 +1070,10 @@ static int axi_dmac_probe(struct platform_device *pdev) dma_dev->copy_align = (dmac->chan.address_align_mask + 1); - axi_dmac_write(dmac, AXI_DMAC_REG_IRQ_MASK, 0x00); + if (dmac->chan.hw_sg) + irq_mask |= AXI_DMAC_IRQ_SOT; + + axi_dmac_write(dmac, AXI_DMAC_REG_IRQ_MASK, irq_mask); if (of_dma_is_coherent(pdev->dev.of_node)) { ret = axi_dmac_read(dmac, AXI_DMAC_REG_COHERENCY_DESC); -- GitLab From f60dfe0c561a8f1b8e30d3770997cbaa636f57f9 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 15 Dec 2023 14:13:13 +0100 Subject: [PATCH 0448/1290] dmaengine: axi-dmac: Improve cyclic DMA transfers in SG mode For cyclic transfers, chain the last descriptor to the first one, and disable IRQ generation if there is no callback registered with the cyclic transfer. Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20231215131313.23840-6-paul@crapouillou.net Signed-off-by: Vinod Koul --- drivers/dma/dma-axi-dmac.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/dma/dma-axi-dmac.c b/drivers/dma/dma-axi-dmac.c index f63acae511fb2..4e339c04fc1ea 100644 --- a/drivers/dma/dma-axi-dmac.c +++ b/drivers/dma/dma-axi-dmac.c @@ -285,12 +285,14 @@ static void axi_dmac_start_transfer(struct axi_dmac_chan *chan) /* * If the hardware supports cyclic transfers and there is no callback to - * call and only a single segment, enable hw cyclic mode to avoid - * unnecessary interrupts. + * call, enable hw cyclic mode to avoid unnecessary interrupts. */ - if (chan->hw_cyclic && desc->cyclic && !desc->vdesc.tx.callback && - desc->num_sgs == 1) - flags |= AXI_DMAC_FLAG_CYCLIC; + if (chan->hw_cyclic && desc->cyclic && !desc->vdesc.tx.callback) { + if (chan->hw_sg) + desc->sg[desc->num_sgs - 1].hw->flags &= ~AXI_DMAC_HW_FLAG_IRQ; + else if (desc->num_sgs == 1) + flags |= AXI_DMAC_FLAG_CYCLIC; + } if (chan->hw_partial_xfer) flags |= AXI_DMAC_FLAG_PARTIAL_REPORT; @@ -411,7 +413,6 @@ static bool axi_dmac_transfer_done(struct axi_dmac_chan *chan, if (chan->hw_sg) { if (active->cyclic) { vchan_cyclic_callback(&active->vdesc); - start_next = true; } else { list_del(&active->vdesc.node); vchan_cookie_complete(&active->vdesc); @@ -667,7 +668,7 @@ static struct dma_async_tx_descriptor *axi_dmac_prep_dma_cyclic( { struct axi_dmac_chan *chan = to_axi_dmac_chan(c); struct axi_dmac_desc *desc; - unsigned int num_periods, num_segments; + unsigned int num_periods, num_segments, num_sgs; if (direction != chan->direction) return NULL; @@ -681,11 +682,16 @@ static struct dma_async_tx_descriptor *axi_dmac_prep_dma_cyclic( num_periods = buf_len / period_len; num_segments = DIV_ROUND_UP(period_len, chan->max_length); + num_sgs = num_periods * num_segments; - desc = axi_dmac_alloc_desc(chan, num_periods * num_segments); + desc = axi_dmac_alloc_desc(chan, num_sgs); if (!desc) return NULL; + /* Chain the last descriptor to the first, and remove its "last" flag */ + desc->sg[num_sgs - 1].hw->next_sg_addr = desc->sg[0].hw_phys; + desc->sg[num_sgs - 1].hw->flags &= ~AXI_DMAC_HW_FLAG_LAST; + axi_dmac_fill_linear_sg(chan, direction, buf_addr, num_periods, period_len, desc->sg); -- GitLab From dc51b4442dd94ab12c146c1897bbdb40e16d5636 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Tue, 14 Nov 2023 10:48:21 -0500 Subject: [PATCH 0449/1290] dmaengine: fsl-edma: fix eDMAv4 channel allocation issue The eDMAv4 channel mux has a limitation where certain requests must use even channels, while others must use odd numbers. Add two flags (ARGS_EVEN_CH and ARGS_ODD_CH) to reflect this limitation. The device tree source (dts) files need to be updated accordingly. This issue was identified by the following commit: commit a725990557e7 ("arm64: dts: imx93: Fix the dmas entries order") Reverting channel orders triggered this problem. Fixes: 72f5801a4e2b ("dmaengine: fsl-edma: integrate v3 support") Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20231114154824.3617255-2-Frank.Li@nxp.com Signed-off-by: Vinod Koul --- drivers/dma/fsl-edma-main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/dma/fsl-edma-main.c b/drivers/dma/fsl-edma-main.c index 4635e16d7705e..3ee08f390f810 100644 --- a/drivers/dma/fsl-edma-main.c +++ b/drivers/dma/fsl-edma-main.c @@ -24,6 +24,8 @@ #define ARGS_RX BIT(0) #define ARGS_REMOTE BIT(1) #define ARGS_MULTI_FIFO BIT(2) +#define ARGS_EVEN_CH BIT(3) +#define ARGS_ODD_CH BIT(4) static void fsl_edma_synchronize(struct dma_chan *chan) { @@ -157,6 +159,12 @@ static struct dma_chan *fsl_edma3_xlate(struct of_phandle_args *dma_spec, fsl_chan->is_remote = dma_spec->args[2] & ARGS_REMOTE; fsl_chan->is_multi_fifo = dma_spec->args[2] & ARGS_MULTI_FIFO; + if ((dma_spec->args[2] & ARGS_EVEN_CH) && (i & 0x1)) + continue; + + if ((dma_spec->args[2] & ARGS_ODD_CH) && !(i & 0x1)) + continue; + if (!b_chmux && i == dma_spec->args[0]) { chan = dma_get_slave_channel(chan); chan->device->privatecnt++; -- GitLab From 1e9b05258271b76ccc04a4b535009d2cb596506a Mon Sep 17 00:00:00 2001 From: Frank Li Date: Tue, 14 Nov 2023 10:48:22 -0500 Subject: [PATCH 0450/1290] dt-bindings: dma: fsl-edma: Add fsl-edma.h to prevent hardcoding in dts Introduce a common dt-bindings header file, fsl-edma.h, shared between the driver and dts files. This addition aims to eliminate hardcoded values in dts files, promoting maintainability and consistency. DTS header file not support BIT() macro yet. Directly use 2^n number. Signed-off-by: Frank Li Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20231114154824.3617255-3-Frank.Li@nxp.com Signed-off-by: Vinod Koul --- include/dt-bindings/dma/fsl-edma.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 include/dt-bindings/dma/fsl-edma.h diff --git a/include/dt-bindings/dma/fsl-edma.h b/include/dt-bindings/dma/fsl-edma.h new file mode 100644 index 0000000000000..fd11478cfe9cc --- /dev/null +++ b/include/dt-bindings/dma/fsl-edma.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ + +#ifndef _FSL_EDMA_DT_BINDING_H_ +#define _FSL_EDMA_DT_BINDING_H_ + +/* Receive Channel */ +#define FSL_EDMA_RX 0x1 + +/* iMX8 audio remote DMA */ +#define FSL_EDMA_REMOTE 0x2 + +/* FIFO is continue memory region */ +#define FSL_EDMA_MULTI_FIFO 0x4 + +/* Channel need stick to even channel */ +#define FSL_EDMA_EVEN_CH 0x8 + +/* Channel need stick to odd channel */ +#define FSL_EDMA_ODD_CH 0x10 + +#endif -- GitLab From d0e217b72f9f5c5ef35e3423d393ea8093ce98ec Mon Sep 17 00:00:00 2001 From: Frank Li Date: Tue, 14 Nov 2023 10:48:23 -0500 Subject: [PATCH 0451/1290] dmaengine: fsl-edma: utilize common dt-binding header file Refactor the code to use the common dt-binding header file, fsl-edma.h. Renaming ARGS* to FSL_EDMA*, ensuring no functional changes. Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20231114154824.3617255-4-Frank.Li@nxp.com Signed-off-by: Vinod Koul --- drivers/dma/fsl-edma-main.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/dma/fsl-edma-main.c b/drivers/dma/fsl-edma-main.c index 3ee08f390f810..f53b0ec17bcbc 100644 --- a/drivers/dma/fsl-edma-main.c +++ b/drivers/dma/fsl-edma-main.c @@ -9,6 +9,7 @@ * Vybrid and Layerscape SoCs. */ +#include #include #include #include @@ -21,12 +22,6 @@ #include "fsl-edma-common.h" -#define ARGS_RX BIT(0) -#define ARGS_REMOTE BIT(1) -#define ARGS_MULTI_FIFO BIT(2) -#define ARGS_EVEN_CH BIT(3) -#define ARGS_ODD_CH BIT(4) - static void fsl_edma_synchronize(struct dma_chan *chan) { struct fsl_edma_chan *fsl_chan = to_fsl_edma_chan(chan); @@ -155,14 +150,14 @@ static struct dma_chan *fsl_edma3_xlate(struct of_phandle_args *dma_spec, i = fsl_chan - fsl_edma->chans; fsl_chan->priority = dma_spec->args[1]; - fsl_chan->is_rxchan = dma_spec->args[2] & ARGS_RX; - fsl_chan->is_remote = dma_spec->args[2] & ARGS_REMOTE; - fsl_chan->is_multi_fifo = dma_spec->args[2] & ARGS_MULTI_FIFO; + fsl_chan->is_rxchan = dma_spec->args[2] & FSL_EDMA_RX; + fsl_chan->is_remote = dma_spec->args[2] & FSL_EDMA_REMOTE; + fsl_chan->is_multi_fifo = dma_spec->args[2] & FSL_EDMA_MULTI_FIFO; - if ((dma_spec->args[2] & ARGS_EVEN_CH) && (i & 0x1)) + if ((dma_spec->args[2] & FSL_EDMA_EVEN_CH) && (i & 0x1)) continue; - if ((dma_spec->args[2] & ARGS_ODD_CH) && !(i & 0x1)) + if ((dma_spec->args[2] & FSL_EDMA_ODD_CH) && !(i & 0x1)) continue; if (!b_chmux && i == dma_spec->args[0]) { -- GitLab From f5c24d94512f1b288262beda4d3dcb9629222fc7 Mon Sep 17 00:00:00 2001 From: Amelie Delaunay Date: Wed, 13 Dec 2023 17:04:52 +0100 Subject: [PATCH 0452/1290] dmaengine: fix NULL pointer in channel unregistration function __dma_async_device_channel_register() can fail. In case of failure, chan->local is freed (with free_percpu()), and chan->local is nullified. When dma_async_device_unregister() is called (because of managed API or intentionally by DMA controller driver), channels are unconditionally unregistered, leading to this NULL pointer: [ 1.318693] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000d0 [...] [ 1.484499] Call trace: [ 1.486930] device_del+0x40/0x394 [ 1.490314] device_unregister+0x20/0x7c [ 1.494220] __dma_async_device_channel_unregister+0x68/0xc0 Look at dma_async_device_register() function error path, channel device unregistration is done only if chan->local is not NULL. Then add the same condition at the beginning of __dma_async_device_channel_unregister() function, to avoid NULL pointer issue whatever the API used to reach this function. Fixes: d2fb0a043838 ("dmaengine: break out channel registration") Signed-off-by: Amelie Delaunay Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20231213160452.2598073-1-amelie.delaunay@foss.st.com Signed-off-by: Vinod Koul --- drivers/dma/dmaengine.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index b7388ae62d7f1..491b222402216 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1103,6 +1103,9 @@ EXPORT_SYMBOL_GPL(dma_async_device_channel_register); static void __dma_async_device_channel_unregister(struct dma_device *device, struct dma_chan *chan) { + if (chan->local == NULL) + return; + WARN_ONCE(!device->device_release && chan->client_count, "%s called while %d clients hold a reference\n", __func__, chan->client_count); -- GitLab From 3b08b3775593442be52cbb99efbccbd7fe4fa3fe Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Wed, 13 Dec 2023 13:43:18 +0530 Subject: [PATCH 0453/1290] dmaengine: ti: k3-udma: Add PSIL threads for AM62P and J722S Add PSIL thread information and enable UDMA support for AM62P and J722S SoC. J722S SoC family is a superset of AM62P, thus common PSIL thread ID map is reused for both devices. For those interested, more details about the SoC can be found in the Technical Reference Manual here: AM62P - https://www.ti.com/lit/pdf/spruj83 J722S - https://www.ti.com/lit/zip/sprujb3 Signed-off-by: Vignesh Raghavendra Signed-off-by: Bryan Brattlof Signed-off-by: Vaishnav Achath Reviewed-by: Jai Luthra Link: https://lore.kernel.org/r/20231213081318.26203-1-vaishnav.a@ti.com Signed-off-by: Vinod Koul --- drivers/dma/ti/Makefile | 3 +- drivers/dma/ti/k3-psil-am62p.c | 325 +++++++++++++++++++++++++++++++++ drivers/dma/ti/k3-psil-priv.h | 1 + drivers/dma/ti/k3-psil.c | 2 + drivers/dma/ti/k3-udma.c | 2 + 5 files changed, 332 insertions(+), 1 deletion(-) create mode 100644 drivers/dma/ti/k3-psil-am62p.c diff --git a/drivers/dma/ti/Makefile b/drivers/dma/ti/Makefile index acc950bf609c3..d376c117cecf6 100644 --- a/drivers/dma/ti/Makefile +++ b/drivers/dma/ti/Makefile @@ -12,6 +12,7 @@ k3-psil-lib-objs := k3-psil.o \ k3-psil-j721s2.o \ k3-psil-am62.o \ k3-psil-am62a.o \ - k3-psil-j784s4.o + k3-psil-j784s4.o \ + k3-psil-am62p.o obj-$(CONFIG_TI_K3_PSIL) += k3-psil-lib.o obj-$(CONFIG_TI_DMA_CROSSBAR) += dma-crossbar.o diff --git a/drivers/dma/ti/k3-psil-am62p.c b/drivers/dma/ti/k3-psil-am62p.c new file mode 100644 index 0000000000000..0f338e16d9710 --- /dev/null +++ b/drivers/dma/ti/k3-psil-am62p.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com + */ + +#include + +#include "k3-psil-priv.h" + +#define PSIL_PDMA_XY_TR(x) \ + { \ + .thread_id = x, \ + .ep_config = { \ + .ep_type = PSIL_EP_PDMA_XY, \ + .mapped_channel_id = -1, \ + .default_flow_id = -1, \ + }, \ + } + +#define PSIL_PDMA_XY_PKT(x) \ + { \ + .thread_id = x, \ + .ep_config = { \ + .ep_type = PSIL_EP_PDMA_XY, \ + .mapped_channel_id = -1, \ + .default_flow_id = -1, \ + .pkt_mode = 1, \ + }, \ + } + +#define PSIL_ETHERNET(x, ch, flow_base, flow_cnt) \ + { \ + .thread_id = x, \ + .ep_config = { \ + .ep_type = PSIL_EP_NATIVE, \ + .pkt_mode = 1, \ + .needs_epib = 1, \ + .psd_size = 16, \ + .mapped_channel_id = ch, \ + .flow_start = flow_base, \ + .flow_num = flow_cnt, \ + .default_flow_id = flow_base, \ + }, \ + } + +#define PSIL_SAUL(x, ch, flow_base, flow_cnt, default_flow, tx) \ + { \ + .thread_id = x, \ + .ep_config = { \ + .ep_type = PSIL_EP_NATIVE, \ + .pkt_mode = 1, \ + .needs_epib = 1, \ + .psd_size = 64, \ + .mapped_channel_id = ch, \ + .flow_start = flow_base, \ + .flow_num = flow_cnt, \ + .default_flow_id = default_flow, \ + .notdpkt = tx, \ + }, \ + } + +#define PSIL_PDMA_MCASP(x) \ + { \ + .thread_id = x, \ + .ep_config = { \ + .ep_type = PSIL_EP_PDMA_XY, \ + .pdma_acc32 = 1, \ + .pdma_burst = 1, \ + }, \ + } + +#define PSIL_CSI2RX(x) \ + { \ + .thread_id = x, \ + .ep_config = { \ + .ep_type = PSIL_EP_NATIVE, \ + }, \ + } + +/* PSI-L source thread IDs, used for RX (DMA_DEV_TO_MEM) */ +static struct psil_ep am62p_src_ep_map[] = { + /* SAUL */ + PSIL_SAUL(0x7504, 20, 35, 8, 35, 0), + PSIL_SAUL(0x7505, 21, 35, 8, 36, 0), + PSIL_SAUL(0x7506, 22, 43, 8, 43, 0), + PSIL_SAUL(0x7507, 23, 43, 8, 44, 0), + /* PDMA_MAIN0 - SPI0-2 */ + PSIL_PDMA_XY_PKT(0x4300), + PSIL_PDMA_XY_PKT(0x4301), + PSIL_PDMA_XY_PKT(0x4302), + PSIL_PDMA_XY_PKT(0x4303), + PSIL_PDMA_XY_PKT(0x4304), + PSIL_PDMA_XY_PKT(0x4305), + PSIL_PDMA_XY_PKT(0x4306), + PSIL_PDMA_XY_PKT(0x4307), + PSIL_PDMA_XY_PKT(0x4308), + PSIL_PDMA_XY_PKT(0x4309), + PSIL_PDMA_XY_PKT(0x430a), + PSIL_PDMA_XY_PKT(0x430b), + /* PDMA_MAIN1 - UART0-6 */ + PSIL_PDMA_XY_PKT(0x4400), + PSIL_PDMA_XY_PKT(0x4401), + PSIL_PDMA_XY_PKT(0x4402), + PSIL_PDMA_XY_PKT(0x4403), + PSIL_PDMA_XY_PKT(0x4404), + PSIL_PDMA_XY_PKT(0x4405), + PSIL_PDMA_XY_PKT(0x4406), + /* PDMA_MAIN2 - MCASP0-2 */ + PSIL_PDMA_MCASP(0x4500), + PSIL_PDMA_MCASP(0x4501), + PSIL_PDMA_MCASP(0x4502), + /* CPSW3G */ + PSIL_ETHERNET(0x4600, 19, 19, 16), + /* CSI2RX */ + PSIL_CSI2RX(0x5000), + PSIL_CSI2RX(0x5001), + PSIL_CSI2RX(0x5002), + PSIL_CSI2RX(0x5003), + PSIL_CSI2RX(0x5004), + PSIL_CSI2RX(0x5005), + PSIL_CSI2RX(0x5006), + PSIL_CSI2RX(0x5007), + PSIL_CSI2RX(0x5008), + PSIL_CSI2RX(0x5009), + PSIL_CSI2RX(0x500a), + PSIL_CSI2RX(0x500b), + PSIL_CSI2RX(0x500c), + PSIL_CSI2RX(0x500d), + PSIL_CSI2RX(0x500e), + PSIL_CSI2RX(0x500f), + PSIL_CSI2RX(0x5010), + PSIL_CSI2RX(0x5011), + PSIL_CSI2RX(0x5012), + PSIL_CSI2RX(0x5013), + PSIL_CSI2RX(0x5014), + PSIL_CSI2RX(0x5015), + PSIL_CSI2RX(0x5016), + PSIL_CSI2RX(0x5017), + PSIL_CSI2RX(0x5018), + PSIL_CSI2RX(0x5019), + PSIL_CSI2RX(0x501a), + PSIL_CSI2RX(0x501b), + PSIL_CSI2RX(0x501c), + PSIL_CSI2RX(0x501d), + PSIL_CSI2RX(0x501e), + PSIL_CSI2RX(0x501f), + PSIL_CSI2RX(0x5000), + PSIL_CSI2RX(0x5001), + PSIL_CSI2RX(0x5002), + PSIL_CSI2RX(0x5003), + PSIL_CSI2RX(0x5004), + PSIL_CSI2RX(0x5005), + PSIL_CSI2RX(0x5006), + PSIL_CSI2RX(0x5007), + PSIL_CSI2RX(0x5008), + PSIL_CSI2RX(0x5009), + PSIL_CSI2RX(0x500a), + PSIL_CSI2RX(0x500b), + PSIL_CSI2RX(0x500c), + PSIL_CSI2RX(0x500d), + PSIL_CSI2RX(0x500e), + PSIL_CSI2RX(0x500f), + PSIL_CSI2RX(0x5010), + PSIL_CSI2RX(0x5011), + PSIL_CSI2RX(0x5012), + PSIL_CSI2RX(0x5013), + PSIL_CSI2RX(0x5014), + PSIL_CSI2RX(0x5015), + PSIL_CSI2RX(0x5016), + PSIL_CSI2RX(0x5017), + PSIL_CSI2RX(0x5018), + PSIL_CSI2RX(0x5019), + PSIL_CSI2RX(0x501a), + PSIL_CSI2RX(0x501b), + PSIL_CSI2RX(0x501c), + PSIL_CSI2RX(0x501d), + PSIL_CSI2RX(0x501e), + PSIL_CSI2RX(0x501f), + /* CSIRX 1-3 (only for J722S) */ + PSIL_CSI2RX(0x5100), + PSIL_CSI2RX(0x5101), + PSIL_CSI2RX(0x5102), + PSIL_CSI2RX(0x5103), + PSIL_CSI2RX(0x5104), + PSIL_CSI2RX(0x5105), + PSIL_CSI2RX(0x5106), + PSIL_CSI2RX(0x5107), + PSIL_CSI2RX(0x5108), + PSIL_CSI2RX(0x5109), + PSIL_CSI2RX(0x510a), + PSIL_CSI2RX(0x510b), + PSIL_CSI2RX(0x510c), + PSIL_CSI2RX(0x510d), + PSIL_CSI2RX(0x510e), + PSIL_CSI2RX(0x510f), + PSIL_CSI2RX(0x5110), + PSIL_CSI2RX(0x5111), + PSIL_CSI2RX(0x5112), + PSIL_CSI2RX(0x5113), + PSIL_CSI2RX(0x5114), + PSIL_CSI2RX(0x5115), + PSIL_CSI2RX(0x5116), + PSIL_CSI2RX(0x5117), + PSIL_CSI2RX(0x5118), + PSIL_CSI2RX(0x5119), + PSIL_CSI2RX(0x511a), + PSIL_CSI2RX(0x511b), + PSIL_CSI2RX(0x511c), + PSIL_CSI2RX(0x511d), + PSIL_CSI2RX(0x511e), + PSIL_CSI2RX(0x511f), + PSIL_CSI2RX(0x5200), + PSIL_CSI2RX(0x5201), + PSIL_CSI2RX(0x5202), + PSIL_CSI2RX(0x5203), + PSIL_CSI2RX(0x5204), + PSIL_CSI2RX(0x5205), + PSIL_CSI2RX(0x5206), + PSIL_CSI2RX(0x5207), + PSIL_CSI2RX(0x5208), + PSIL_CSI2RX(0x5209), + PSIL_CSI2RX(0x520a), + PSIL_CSI2RX(0x520b), + PSIL_CSI2RX(0x520c), + PSIL_CSI2RX(0x520d), + PSIL_CSI2RX(0x520e), + PSIL_CSI2RX(0x520f), + PSIL_CSI2RX(0x5210), + PSIL_CSI2RX(0x5211), + PSIL_CSI2RX(0x5212), + PSIL_CSI2RX(0x5213), + PSIL_CSI2RX(0x5214), + PSIL_CSI2RX(0x5215), + PSIL_CSI2RX(0x5216), + PSIL_CSI2RX(0x5217), + PSIL_CSI2RX(0x5218), + PSIL_CSI2RX(0x5219), + PSIL_CSI2RX(0x521a), + PSIL_CSI2RX(0x521b), + PSIL_CSI2RX(0x521c), + PSIL_CSI2RX(0x521d), + PSIL_CSI2RX(0x521e), + PSIL_CSI2RX(0x521f), + PSIL_CSI2RX(0x5300), + PSIL_CSI2RX(0x5301), + PSIL_CSI2RX(0x5302), + PSIL_CSI2RX(0x5303), + PSIL_CSI2RX(0x5304), + PSIL_CSI2RX(0x5305), + PSIL_CSI2RX(0x5306), + PSIL_CSI2RX(0x5307), + PSIL_CSI2RX(0x5308), + PSIL_CSI2RX(0x5309), + PSIL_CSI2RX(0x530a), + PSIL_CSI2RX(0x530b), + PSIL_CSI2RX(0x530c), + PSIL_CSI2RX(0x530d), + PSIL_CSI2RX(0x530e), + PSIL_CSI2RX(0x530f), + PSIL_CSI2RX(0x5310), + PSIL_CSI2RX(0x5311), + PSIL_CSI2RX(0x5312), + PSIL_CSI2RX(0x5313), + PSIL_CSI2RX(0x5314), + PSIL_CSI2RX(0x5315), + PSIL_CSI2RX(0x5316), + PSIL_CSI2RX(0x5317), + PSIL_CSI2RX(0x5318), + PSIL_CSI2RX(0x5319), + PSIL_CSI2RX(0x531a), + PSIL_CSI2RX(0x531b), + PSIL_CSI2RX(0x531c), + PSIL_CSI2RX(0x531d), + PSIL_CSI2RX(0x531e), + PSIL_CSI2RX(0x531f), +}; + +/* PSI-L destination thread IDs, used for TX (DMA_MEM_TO_DEV) */ +static struct psil_ep am62p_dst_ep_map[] = { + /* SAUL */ + PSIL_SAUL(0xf500, 27, 83, 8, 83, 1), + PSIL_SAUL(0xf501, 28, 91, 8, 91, 1), + /* PDMA_MAIN0 - SPI0-2 */ + PSIL_PDMA_XY_PKT(0xc300), + PSIL_PDMA_XY_PKT(0xc301), + PSIL_PDMA_XY_PKT(0xc302), + PSIL_PDMA_XY_PKT(0xc303), + PSIL_PDMA_XY_PKT(0xc304), + PSIL_PDMA_XY_PKT(0xc305), + PSIL_PDMA_XY_PKT(0xc306), + PSIL_PDMA_XY_PKT(0xc307), + PSIL_PDMA_XY_PKT(0xc308), + PSIL_PDMA_XY_PKT(0xc309), + PSIL_PDMA_XY_PKT(0xc30a), + PSIL_PDMA_XY_PKT(0xc30b), + /* PDMA_MAIN1 - UART0-6 */ + PSIL_PDMA_XY_PKT(0xc400), + PSIL_PDMA_XY_PKT(0xc401), + PSIL_PDMA_XY_PKT(0xc402), + PSIL_PDMA_XY_PKT(0xc403), + PSIL_PDMA_XY_PKT(0xc404), + PSIL_PDMA_XY_PKT(0xc405), + PSIL_PDMA_XY_PKT(0xc406), + /* PDMA_MAIN2 - MCASP0-2 */ + PSIL_PDMA_MCASP(0xc500), + PSIL_PDMA_MCASP(0xc501), + PSIL_PDMA_MCASP(0xc502), + /* CPSW3G */ + PSIL_ETHERNET(0xc600, 19, 19, 8), + PSIL_ETHERNET(0xc601, 20, 27, 8), + PSIL_ETHERNET(0xc602, 21, 35, 8), + PSIL_ETHERNET(0xc603, 22, 43, 8), + PSIL_ETHERNET(0xc604, 23, 51, 8), + PSIL_ETHERNET(0xc605, 24, 59, 8), + PSIL_ETHERNET(0xc606, 25, 67, 8), + PSIL_ETHERNET(0xc607, 26, 75, 8), +}; + +struct psil_ep_map am62p_ep_map = { + .name = "am62p", + .src = am62p_src_ep_map, + .src_count = ARRAY_SIZE(am62p_src_ep_map), + .dst = am62p_dst_ep_map, + .dst_count = ARRAY_SIZE(am62p_dst_ep_map), +}; diff --git a/drivers/dma/ti/k3-psil-priv.h b/drivers/dma/ti/k3-psil-priv.h index c383723d1c8f6..a577be97e3447 100644 --- a/drivers/dma/ti/k3-psil-priv.h +++ b/drivers/dma/ti/k3-psil-priv.h @@ -45,5 +45,6 @@ extern struct psil_ep_map j721s2_ep_map; extern struct psil_ep_map am62_ep_map; extern struct psil_ep_map am62a_ep_map; extern struct psil_ep_map j784s4_ep_map; +extern struct psil_ep_map am62p_ep_map; #endif /* K3_PSIL_PRIV_H_ */ diff --git a/drivers/dma/ti/k3-psil.c b/drivers/dma/ti/k3-psil.c index c11389d67a3f0..25148d9524720 100644 --- a/drivers/dma/ti/k3-psil.c +++ b/drivers/dma/ti/k3-psil.c @@ -26,6 +26,8 @@ static const struct soc_device_attribute k3_soc_devices[] = { { .family = "AM62X", .data = &am62_ep_map }, { .family = "AM62AX", .data = &am62a_ep_map }, { .family = "J784S4", .data = &j784s4_ep_map }, + { .family = "AM62PX", .data = &am62p_ep_map }, + { .family = "J722S", .data = &am62p_ep_map }, { /* sentinel */ } }; diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c index 30fd2f386f36a..2841a539c2648 100644 --- a/drivers/dma/ti/k3-udma.c +++ b/drivers/dma/ti/k3-udma.c @@ -4441,6 +4441,8 @@ static const struct soc_device_attribute k3_soc_devices[] = { { .family = "AM62X", .data = &am64_soc_data }, { .family = "AM62AX", .data = &am64_soc_data }, { .family = "J784S4", .data = &j721e_soc_data }, + { .family = "AM62PX", .data = &am64_soc_data }, + { .family = "J722S", .data = &am64_soc_data }, { /* sentinel */ } }; -- GitLab From e271c0ba3f919c48e90c64b703538fbb7865cb63 Mon Sep 17 00:00:00 2001 From: Rex Zhang Date: Tue, 12 Dec 2023 10:21:58 +0800 Subject: [PATCH 0454/1290] dmaengine: idxd: Move dma_free_coherent() out of spinlocked context Task may be rescheduled within dma_free_coherent(). So dma_free_coherent() can't be called between spin_lock() and spin_unlock() to avoid Call Trace: Call Trace: dump_stack_lvl+0x37/0x50 __might_resched+0x16a/0x1c0 vunmap+0x2c/0x70 __iommu_dma_free+0x96/0x100 idxd_device_evl_free+0xd5/0x100 [idxd] device_release_driver_internal+0x197/0x200 unbind_store+0xa1/0xb0 kernfs_fop_write_iter+0x120/0x1c0 vfs_write+0x2d3/0x400 ksys_write+0x63/0xe0 do_syscall_64+0x44/0xa0 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Move it out of the context. Fixes: 244da66cda35 ("dmaengine: idxd: setup event log configuration") Signed-off-by: Rex Zhang Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20231212022158.358619-2-rex.zhang@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 8f754f922217d..fa0f880beae64 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -802,6 +802,9 @@ err_bmap: static void idxd_device_evl_free(struct idxd_device *idxd) { + void *evl_log; + unsigned int evl_log_size; + dma_addr_t evl_dma; union gencfg_reg gencfg; union genctrl_reg genctrl; struct device *dev = &idxd->pdev->dev; @@ -822,11 +825,15 @@ static void idxd_device_evl_free(struct idxd_device *idxd) iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET); iowrite64(0, idxd->reg_base + IDXD_EVLCFG_OFFSET + 8); - dma_free_coherent(dev, evl->log_size, evl->log, evl->dma); bitmap_free(evl->bmap); + evl_log = evl->log; + evl_log_size = evl->log_size; + evl_dma = evl->dma; evl->log = NULL; evl->size = IDXD_EVL_SIZE_MIN; spin_unlock(&evl->lock); + + dma_free_coherent(dev, evl_log_size, evl_log, evl_dma); } static void idxd_group_config_write(struct idxd_group *group) -- GitLab From 26ee018ff6d1c326ac9b9be36513e35870ed09db Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 30 Nov 2023 12:13:12 +0100 Subject: [PATCH 0455/1290] dmaengine: xilinx: xdma: Fix the count of elapsed periods in cyclic mode Xilinx DMA engine is capable of keeping track of the number of elapsed periods and this is an increasing 32-bit counter which is only reset when turning off the engine. No need to add this value to our local counter. Fixes: cd8c732ce1a5 ("dmaengine: xilinx: xdma: Support cyclic transfers") Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20231130111315.729430-2-miquel.raynal@bootlin.com Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 84a88029226fd..2c9c72d4b5a20 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -754,9 +754,9 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) if (ret) goto out; - desc->completed_desc_num += complete_desc_num; - if (desc->cyclic) { + desc->completed_desc_num = complete_desc_num; + ret = regmap_read(xdev->rmap, xchan->base + XDMA_CHAN_STATUS, &st); if (ret) @@ -768,6 +768,8 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) goto out; } + desc->completed_desc_num += complete_desc_num; + /* * if all data blocks are transferred, remove and complete the request */ -- GitLab From 58b61fc75ba901b1fd63c911b31249f36d17e9c4 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 30 Nov 2023 12:13:13 +0100 Subject: [PATCH 0456/1290] dmaengine: xilinx: xdma: Clarify the logic between cyclic/sg modes We support both modes, but they perform totally different taks in the interrupt handler. Clarify what shall be done in each case. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20231130111315.729430-3-miquel.raynal@bootlin.com Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 2c9c72d4b5a20..4efef1b5f89c4 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -765,26 +765,23 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) regmap_write(xdev->rmap, xchan->base + XDMA_CHAN_STATUS, st); vchan_cyclic_callback(vd); - goto out; - } - - desc->completed_desc_num += complete_desc_num; + } else { + desc->completed_desc_num += complete_desc_num; - /* - * if all data blocks are transferred, remove and complete the request - */ - if (desc->completed_desc_num == desc->desc_num) { - list_del(&vd->node); - vchan_cookie_complete(vd); - goto out; - } + /* if all data blocks are transferred, remove and complete the request */ + if (desc->completed_desc_num == desc->desc_num) { + list_del(&vd->node); + vchan_cookie_complete(vd); + goto out; + } - if (desc->completed_desc_num > desc->desc_num || - complete_desc_num != XDMA_DESC_BLOCK_NUM * XDMA_DESC_ADJACENT) - goto out; + if (desc->completed_desc_num > desc->desc_num || + complete_desc_num != XDMA_DESC_BLOCK_NUM * XDMA_DESC_ADJACENT) + goto out; - /* transfer the rest of data (SG only) */ - xdma_xfer_start(xchan); + /* transfer the rest of data */ + xdma_xfer_start(xchan); + } out: spin_unlock(&xchan->vchan.lock); -- GitLab From b3072be7f955e56789a0508c18e9870f45cd9a11 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 30 Nov 2023 12:13:14 +0100 Subject: [PATCH 0457/1290] dmaengine: xilinx: xdma: Better handling of the busy variable The driver internal scatter-gather logic is: * set busy to true * start transfer * set busy to false * trigger next transfer if any * set busy to true Setting busy to false in cyclic transfers does not make any sense and is conceptually wrong. In order to ease the integration of additional callbacks let's move this change to the scatter-gather path. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20231130111315.729430-4-miquel.raynal@bootlin.com Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 4efef1b5f89c4..e931ff42209cd 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -745,7 +745,6 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) if (!vd) goto out; - xchan->busy = false; desc = to_xdma_desc(vd); xdev = xchan->xdev_hdl; @@ -766,6 +765,7 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) vchan_cyclic_callback(vd); } else { + xchan->busy = false; desc->completed_desc_num += complete_desc_num; /* if all data blocks are transferred, remove and complete the request */ -- GitLab From f5c392d106e7cc58c7705799ef4c36c3b2f60b31 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 30 Nov 2023 12:13:15 +0100 Subject: [PATCH 0458/1290] dmaengine: xilinx: xdma: Add terminate_all/synchronize callbacks The driver is capable of starting scatter-gather transfers and needs to wait until their end. It is also capable of starting cyclic transfers and will only be "reset" next time the channel will be reused. In practice most of the time we hear no audio glitch because the sound card stops the flow on its side so the DMA transfers are just discarded. There are however some cases (when playing a bit with a number of frames and with a discontinuous sound file) when the sound card seems to be slightly too slow at stopping the flow, leading to a glitch that can be heard. In all cases, we need to earn better control of the DMA engine and adding proper ->device_terminate_all() and ->device_synchronize() callbacks feels totally relevant. With these two callbacks, no glitch can be heard anymore. Fixes: cd8c732ce1a5 ("dmaengine: xilinx: xdma: Support cyclic transfers") Signed-off-by: Miquel Raynal Tested-by: Lizhi Hou Link: https://lore.kernel.org/r/20231130111315.729430-5-miquel.raynal@bootlin.com Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 68 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index e931ff42209cd..290bb5d2d1e2b 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -371,6 +371,31 @@ static int xdma_xfer_start(struct xdma_chan *xchan) return ret; xchan->busy = true; + + return 0; +} + +/** + * xdma_xfer_stop - Stop DMA transfer + * @xchan: DMA channel pointer + */ +static int xdma_xfer_stop(struct xdma_chan *xchan) +{ + struct virt_dma_desc *vd = vchan_next_desc(&xchan->vchan); + struct xdma_device *xdev = xchan->xdev_hdl; + int ret; + + if (!vd || !xchan->busy) + return -EINVAL; + + /* clear run stop bit to prevent any further auto-triggering */ + ret = regmap_write(xdev->rmap, xchan->base + XDMA_CHAN_CONTROL_W1C, + CHAN_CTRL_RUN_STOP); + if (ret) + return ret; + + xchan->busy = false; + return 0; } @@ -475,6 +500,47 @@ static void xdma_issue_pending(struct dma_chan *chan) spin_unlock_irqrestore(&xdma_chan->vchan.lock, flags); } +/** + * xdma_terminate_all - Terminate all transactions + * @chan: DMA channel pointer + */ +static int xdma_terminate_all(struct dma_chan *chan) +{ + struct xdma_chan *xdma_chan = to_xdma_chan(chan); + struct xdma_desc *desc = NULL; + struct virt_dma_desc *vd; + unsigned long flags; + LIST_HEAD(head); + + spin_lock_irqsave(&xdma_chan->vchan.lock, flags); + xdma_xfer_stop(xdma_chan); + + vd = vchan_next_desc(&xdma_chan->vchan); + if (vd) + desc = to_xdma_desc(vd); + if (desc) { + dma_cookie_complete(&desc->vdesc.tx); + vchan_terminate_vdesc(&desc->vdesc); + } + + vchan_get_all_descriptors(&xdma_chan->vchan, &head); + spin_unlock_irqrestore(&xdma_chan->vchan.lock, flags); + vchan_dma_desc_free_list(&xdma_chan->vchan, &head); + + return 0; +} + +/** + * xdma_synchronize - Synchronize terminated transactions + * @chan: DMA channel pointer + */ +static void xdma_synchronize(struct dma_chan *chan) +{ + struct xdma_chan *xdma_chan = to_xdma_chan(chan); + + vchan_synchronize(&xdma_chan->vchan); +} + /** * xdma_prep_device_sg - prepare a descriptor for a DMA transaction * @chan: DMA channel pointer @@ -1088,6 +1154,8 @@ static int xdma_probe(struct platform_device *pdev) xdev->dma_dev.device_prep_slave_sg = xdma_prep_device_sg; xdev->dma_dev.device_config = xdma_device_config; xdev->dma_dev.device_issue_pending = xdma_issue_pending; + xdev->dma_dev.device_terminate_all = xdma_terminate_all; + xdev->dma_dev.device_synchronize = xdma_synchronize; xdev->dma_dev.filter.map = pdata->device_map; xdev->dma_dev.filter.mapcnt = pdata->device_map_cnt; xdev->dma_dev.filter.fn = xdma_filter_fn; -- GitLab From 6e2387183312cdfce6326b2626c0b801c2ffe686 Mon Sep 17 00:00:00 2001 From: Jan Kuliga Date: Mon, 18 Dec 2023 12:39:36 +0100 Subject: [PATCH 0459/1290] dmaengine: xilinx: xdma: Get rid of unused code Get rid of duplicated macro definitions, as these macros are defined earlier in the file. Also, get rid of unused member of 'struct xdma_desc'. Signed-off-by: Jan Kuliga Link: https://lore.kernel.org/r/20231218113943.9099-2-jankul@alatek.krakow.pl Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma-regs.h | 12 ------------ drivers/dma/xilinx/xdma.c | 2 -- 2 files changed, 14 deletions(-) diff --git a/drivers/dma/xilinx/xdma-regs.h b/drivers/dma/xilinx/xdma-regs.h index e641a5083e14b..0b17a931f5830 100644 --- a/drivers/dma/xilinx/xdma-regs.h +++ b/drivers/dma/xilinx/xdma-regs.h @@ -134,18 +134,6 @@ struct xdma_hw_desc { #define XDMA_SGDMA_DESC_ADJ 0x4088 #define XDMA_SGDMA_DESC_CREDIT 0x408c -/* bits of the SG DMA control register */ -#define XDMA_CTRL_RUN_STOP BIT(0) -#define XDMA_CTRL_IE_DESC_STOPPED BIT(1) -#define XDMA_CTRL_IE_DESC_COMPLETED BIT(2) -#define XDMA_CTRL_IE_DESC_ALIGN_MISMATCH BIT(3) -#define XDMA_CTRL_IE_MAGIC_STOPPED BIT(4) -#define XDMA_CTRL_IE_IDLE_STOPPED BIT(6) -#define XDMA_CTRL_IE_READ_ERROR GENMASK(13, 9) -#define XDMA_CTRL_IE_DESC_ERROR GENMASK(23, 19) -#define XDMA_CTRL_NON_INCR_ADDR BIT(25) -#define XDMA_CTRL_POLL_MODE_WB BIT(26) - /* * interrupt registers */ diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 290bb5d2d1e2b..ddb9e7d074616 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -78,7 +78,6 @@ struct xdma_chan { * @vdesc: Virtual DMA descriptor * @chan: DMA channel pointer * @dir: Transferring direction of the request - * @dev_addr: Physical address on DMA device side * @desc_blocks: Hardware descriptor blocks * @dblk_num: Number of hardware descriptor blocks * @desc_num: Number of hardware descriptors @@ -91,7 +90,6 @@ struct xdma_desc { struct virt_dma_desc vdesc; struct xdma_chan *chan; enum dma_transfer_direction dir; - u64 dev_addr; struct xdma_desc_block *desc_blocks; u32 dblk_num; u32 desc_num; -- GitLab From 7a9c7f46bd0abea214d96f00f78622f24c798ad8 Mon Sep 17 00:00:00 2001 From: Jan Kuliga Date: Mon, 18 Dec 2023 12:39:37 +0100 Subject: [PATCH 0460/1290] dmaengine: xilinx: xdma: Add necessary macro definitions Complete lacking bits describing the status/control register values. Add macros describing the status/control registers. Signed-off-by: Jan Kuliga Link: https://lore.kernel.org/r/20231218113943.9099-3-jankul@alatek.krakow.pl Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma-regs.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/dma/xilinx/xdma-regs.h b/drivers/dma/xilinx/xdma-regs.h index 0b17a931f5830..98117e8a466fa 100644 --- a/drivers/dma/xilinx/xdma-regs.h +++ b/drivers/dma/xilinx/xdma-regs.h @@ -76,6 +76,7 @@ struct xdma_hw_desc { #define XDMA_CHAN_CONTROL_W1S 0x8 #define XDMA_CHAN_CONTROL_W1C 0xc #define XDMA_CHAN_STATUS 0x40 +#define XDMA_CHAN_STATUS_RC 0x44 #define XDMA_CHAN_COMPLETED_DESC 0x48 #define XDMA_CHAN_ALIGNMENTS 0x4c #define XDMA_CHAN_INTR_ENABLE 0x90 @@ -101,6 +102,7 @@ struct xdma_hw_desc { #define CHAN_CTRL_IE_MAGIC_STOPPED BIT(4) #define CHAN_CTRL_IE_IDLE_STOPPED BIT(6) #define CHAN_CTRL_IE_READ_ERROR GENMASK(13, 9) +#define CHAN_CTRL_IE_WRITE_ERROR GENMASK(18, 14) #define CHAN_CTRL_IE_DESC_ERROR GENMASK(23, 19) #define CHAN_CTRL_NON_INCR_ADDR BIT(25) #define CHAN_CTRL_POLL_MODE_WB BIT(26) @@ -111,8 +113,17 @@ struct xdma_hw_desc { CHAN_CTRL_IE_DESC_ALIGN_MISMATCH | \ CHAN_CTRL_IE_MAGIC_STOPPED | \ CHAN_CTRL_IE_READ_ERROR | \ + CHAN_CTRL_IE_WRITE_ERROR | \ CHAN_CTRL_IE_DESC_ERROR) +#define XDMA_CHAN_STATUS_MASK CHAN_CTRL_START + +#define XDMA_CHAN_ERROR_MASK (CHAN_CTRL_IE_DESC_ALIGN_MISMATCH | \ + CHAN_CTRL_IE_MAGIC_STOPPED | \ + CHAN_CTRL_IE_READ_ERROR | \ + CHAN_CTRL_IE_WRITE_ERROR | \ + CHAN_CTRL_IE_DESC_ERROR) + /* bits of the channel interrupt enable mask */ #define CHAN_IM_DESC_ERROR BIT(19) #define CHAN_IM_READ_ERROR BIT(9) -- GitLab From e5bc76b0e1c54906ca744ed1a7872f4f407d5d2e Mon Sep 17 00:00:00 2001 From: Jan Kuliga Date: Mon, 18 Dec 2023 12:39:38 +0100 Subject: [PATCH 0461/1290] dmaengine: xilinx: xdma: Ease dma_pool alignment requirements According to the XDMA datasheet (PG195), the address of any descriptor must be 32 byte aligned. The datasheet also states that a contiguous block of descriptors must not cross a 4k address boundary. Therefore, it is possible to ease the pressure put on the dma_pool allocator just by requiring sufficient alignment and boundary values. Add proper macro definition and change the values passed into the dma_pool_create(). Signed-off-by: Jan Kuliga Link: https://lore.kernel.org/r/20231218113943.9099-4-jankul@alatek.krakow.pl Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma-regs.h | 7 ++++--- drivers/dma/xilinx/xdma.c | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/dma/xilinx/xdma-regs.h b/drivers/dma/xilinx/xdma-regs.h index 98117e8a466fa..98f5f6fb9ff9c 100644 --- a/drivers/dma/xilinx/xdma-regs.h +++ b/drivers/dma/xilinx/xdma-regs.h @@ -64,9 +64,10 @@ struct xdma_hw_desc { __le64 next_desc; }; -#define XDMA_DESC_SIZE sizeof(struct xdma_hw_desc) -#define XDMA_DESC_BLOCK_SIZE (XDMA_DESC_SIZE * XDMA_DESC_ADJACENT) -#define XDMA_DESC_BLOCK_ALIGN 4096 +#define XDMA_DESC_SIZE sizeof(struct xdma_hw_desc) +#define XDMA_DESC_BLOCK_SIZE (XDMA_DESC_SIZE * XDMA_DESC_ADJACENT) +#define XDMA_DESC_BLOCK_ALIGN 32 +#define XDMA_DESC_BLOCK_BOUNDARY 4096 /* * Channel registers diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index ddb9e7d074616..c22701e76b693 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -741,9 +741,8 @@ static int xdma_alloc_chan_resources(struct dma_chan *chan) return -EINVAL; } - xdma_chan->desc_pool = dma_pool_create(dma_chan_name(chan), - dev, XDMA_DESC_BLOCK_SIZE, - XDMA_DESC_BLOCK_ALIGN, 0); + xdma_chan->desc_pool = dma_pool_create(dma_chan_name(chan), dev, XDMA_DESC_BLOCK_SIZE, + XDMA_DESC_BLOCK_ALIGN, XDMA_DESC_BLOCK_BOUNDARY); if (!xdma_chan->desc_pool) { xdma_err(xdev, "unable to allocate descriptor pool"); return -ENOMEM; -- GitLab From 2226ec072ed3f1bd3f8dbe0cbf0e6cad699aedc2 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 7 Dec 2023 14:19:10 +0200 Subject: [PATCH 0462/1290] phy: qcom-qmp: qserdes-com: Add some more v6 register offsets Add some missing V6 registers offsets that are needed by the new Snapdragon X Elite (X1E80100) platform. Reviewed-by: Dmitry Baryshkov Signed-off-by: Abel Vesa Link: https://lore.kernel.org/r/20231122-phy-qualcomm-v6-v6-20-v7-new-offsets-v3-1-dfd1c375ef61@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-qserdes-com-v6.h | 5 +++++ drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-com-v6.h b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-com-v6.h index f420f8faf16a7..ec7291424dd1f 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-com-v6.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-com-v6.h @@ -22,6 +22,8 @@ #define QSERDES_V6_COM_DIV_FRAC_START2_MODE1 0x34 #define QSERDES_V6_COM_DIV_FRAC_START3_MODE1 0x38 #define QSERDES_V6_COM_HSCLK_SEL_1 0x3c +#define QSERDES_V6_COM_INTEGLOOP_GAIN0_MODE1 0x40 +#define QSERDES_V6_COM_INTEGLOOP_GAIN1_MODE1 0x44 #define QSERDES_V6_COM_VCO_TUNE1_MODE1 0x48 #define QSERDES_V6_COM_VCO_TUNE2_MODE1 0x4c #define QSERDES_V6_COM_BIN_VCOCAL_CMP_CODE1_MODE1 0x50 @@ -48,6 +50,7 @@ #define QSERDES_V6_COM_VCO_TUNE2_MODE0 0xac #define QSERDES_V6_COM_BG_TIMER 0xbc #define QSERDES_V6_COM_SSC_EN_CENTER 0xc0 +#define QSERDES_V6_COM_SSC_ADJ_PER1 0xc4 #define QSERDES_V6_COM_SSC_PER1 0xcc #define QSERDES_V6_COM_SSC_PER2 0xd0 #define QSERDES_V6_COM_PLL_POST_DIV_MUX 0xd8 @@ -56,6 +59,7 @@ #define QSERDES_V6_COM_SYS_CLK_CTRL 0xe4 #define QSERDES_V6_COM_SYSCLK_BUF_ENABLE 0xe8 #define QSERDES_V6_COM_PLL_IVCO 0xf4 +#define QSERDES_V6_COM_PLL_IVCO_MODE1 0xf8 #define QSERDES_V6_COM_SYSCLK_EN_SEL 0x110 #define QSERDES_V6_COM_RESETSM_CNTRL 0x118 #define QSERDES_V6_COM_LOCK_CMP_EN 0x120 @@ -63,6 +67,7 @@ #define QSERDES_V6_COM_VCO_TUNE_CTRL 0x13c #define QSERDES_V6_COM_VCO_TUNE_MAP 0x140 #define QSERDES_V6_COM_VCO_TUNE_INITVAL2 0x148 +#define QSERDES_V6_COM_VCO_TUNE_MAXVAL2 0x158 #define QSERDES_V6_COM_CLK_SELECT 0x164 #define QSERDES_V6_COM_CORE_CLK_EN 0x170 #define QSERDES_V6_COM_CMN_CONFIG_1 0x174 diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6.h b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6.h index 8883e1de730ef..23ffcfae9efab 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6.h @@ -23,6 +23,7 @@ #define QSERDES_V6_TX_PARRATE_REC_DETECT_IDLE_EN 0x60 #define QSERDES_V6_TX_BIST_PATTERN7 0x7c #define QSERDES_V6_TX_LANE_MODE_1 0x84 +#define QSERDES_V6_TX_LANE_MODE_2 0x88 #define QSERDES_V6_TX_LANE_MODE_3 0x8c #define QSERDES_V6_TX_LANE_MODE_4 0x90 #define QSERDES_V6_TX_LANE_MODE_5 0x94 -- GitLab From a40542507b9045da03f4e013ab8562f6e6fe8aad Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 7 Dec 2023 14:19:11 +0200 Subject: [PATCH 0463/1290] phy: qcom-qmp: qserdes-txrx: Add some more v6.20 register offsets Add some missing v6.20 registers offsets that are needed by the new Snapdragon X Elite (X1E80100) platform. Reviewed-by: Dmitry Baryshkov Signed-off-by: Abel Vesa Link: https://lore.kernel.org/r/20231122-phy-qualcomm-v6-v6-20-v7-new-offsets-v3-2-dfd1c375ef61@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_20.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_20.h b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_20.h index 5385a8b609707..6ed5339fd2ea8 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_20.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_20.h @@ -15,10 +15,13 @@ #define QSERDES_V6_20_RX_UCDR_FO_GAIN_RATE_2 0x08 #define QSERDES_V6_20_RX_UCDR_FO_GAIN_RATE_3 0x0c +#define QSERDES_V6_20_RX_UCDR_SO_GAIN_RATE_2 0x18 #define QSERDES_V6_20_RX_UCDR_PI_CONTROLS 0x20 #define QSERDES_V6_20_RX_UCDR_SO_ACC_DEFAULT_VAL_RATE3 0x34 #define QSERDES_V6_20_RX_IVCM_CAL_CTRL2 0x9c #define QSERDES_V6_20_RX_IVCM_POSTCAL_OFFSET 0xa0 +#define QSERDES_V6_20_RX_DFE_1 0xac +#define QSERDES_V6_20_RX_DFE_2 0xb0 #define QSERDES_V6_20_RX_DFE_3 0xb4 #define QSERDES_V6_20_RX_VGA_CAL_MAN_VAL 0xe8 #define QSERDES_V6_20_RX_GM_CAL 0x10c @@ -41,5 +44,6 @@ #define QSERDES_V6_20_RX_MODE_RATE3_B4 0x220 #define QSERDES_V6_20_RX_MODE_RATE3_B5 0x224 #define QSERDES_V6_20_RX_MODE_RATE3_B6 0x228 +#define QSERDES_V6_20_RX_BKUP_CTRL1 0x22c #endif -- GitLab From 7b98cf0e9b5f8a05a7f0f0d06d3cfa130bb576e2 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 7 Dec 2023 14:19:12 +0200 Subject: [PATCH 0464/1290] phy: qcom-qmp: pcs: Add v7 register offsets The X1E80100 platform bumps the HW version of QMP phy to v7 for USB, and PCIe. Add the new PCS offsets in a dedicated header file. Signed-off-by: Abel Vesa Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20231122-phy-qualcomm-v6-v6-20-v7-new-offsets-v3-3-dfd1c375ef61@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-pcs-v7.h | 32 ++++++++++++++++++++++ drivers/phy/qualcomm/phy-qcom-qmp.h | 2 ++ 2 files changed, 34 insertions(+) create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-pcs-v7.h diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-pcs-v7.h b/drivers/phy/qualcomm/phy-qcom-qmp-pcs-v7.h new file mode 100644 index 0000000000000..c7759892ed2ea --- /dev/null +++ b/drivers/phy/qualcomm/phy-qcom-qmp-pcs-v7.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023, Linaro Limited + */ + +#ifndef QCOM_PHY_QMP_PCS_V7_H_ +#define QCOM_PHY_QMP_PCS_V7_H_ + +/* Only for QMP V7 PHY - USB/PCIe PCS registers */ +#define QPHY_V7_PCS_SW_RESET 0x000 +#define QPHY_V7_PCS_PCS_STATUS1 0x014 +#define QPHY_V7_PCS_POWER_DOWN_CONTROL 0x040 +#define QPHY_V7_PCS_START_CONTROL 0x044 +#define QPHY_V7_PCS_POWER_STATE_CONFIG1 0x090 +#define QPHY_V7_PCS_LOCK_DETECT_CONFIG1 0x0c4 +#define QPHY_V7_PCS_LOCK_DETECT_CONFIG2 0x0c8 +#define QPHY_V7_PCS_LOCK_DETECT_CONFIG3 0x0cc +#define QPHY_V7_PCS_LOCK_DETECT_CONFIG6 0x0d8 +#define QPHY_V7_PCS_REFGEN_REQ_CONFIG1 0x0dc +#define QPHY_V7_PCS_RX_SIGDET_LVL 0x188 +#define QPHY_V7_PCS_RCVR_DTCT_DLY_P1U2_L 0x190 +#define QPHY_V7_PCS_RCVR_DTCT_DLY_P1U2_H 0x194 +#define QPHY_V7_PCS_RATE_SLEW_CNTRL1 0x198 +#define QPHY_V7_PCS_CDR_RESET_TIME 0x1b0 +#define QPHY_V7_PCS_ALIGN_DETECT_CONFIG1 0x1c0 +#define QPHY_V7_PCS_ALIGN_DETECT_CONFIG2 0x1c4 +#define QPHY_V7_PCS_PCS_TX_RX_CONFIG 0x1d0 +#define QPHY_V7_PCS_EQ_CONFIG1 0x1dc +#define QPHY_V7_PCS_EQ_CONFIG2 0x1e0 +#define QPHY_V7_PCS_EQ_CONFIG5 0x1ec + +#endif diff --git a/drivers/phy/qualcomm/phy-qcom-qmp.h b/drivers/phy/qualcomm/phy-qcom-qmp.h index 71f063f4a56e3..21f6a56e7ae37 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp.h @@ -44,6 +44,8 @@ #include "phy-qcom-qmp-pcs-v6_20.h" +#include "phy-qcom-qmp-pcs-v7.h" + /* Only for QMP V3 & V4 PHY - DP COM registers */ #define QPHY_V3_DP_COM_PHY_MODE_CTRL 0x00 #define QPHY_V3_DP_COM_SW_RESET 0x04 -- GitLab From 8d4f9f801095b120e433d935b296baf0e3bdc6a0 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 7 Dec 2023 14:19:13 +0200 Subject: [PATCH 0465/1290] phy: qcom-qmp: pcs-usb: Add v7 register offsets The X1E80100 platform bumps the HW version of QMP phy to v7 for USB. Add the new PCS USB specific offsets in a dedicated header file. Signed-off-by: Abel Vesa Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20231122-phy-qualcomm-v6-v6-20-v7-new-offsets-v3-4-dfd1c375ef61@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-pcs-usb-v7.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-pcs-usb-v7.h diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-pcs-usb-v7.h b/drivers/phy/qualcomm/phy-qcom-qmp-pcs-usb-v7.h new file mode 100644 index 0000000000000..24368d45ae764 --- /dev/null +++ b/drivers/phy/qualcomm/phy-qcom-qmp-pcs-usb-v7.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023, Linaro Limited + */ + +#ifndef QCOM_PHY_QMP_PCS_USB_V7_H_ +#define QCOM_PHY_QMP_PCS_USB_V7_H_ + +#define QPHY_V7_PCS_USB3_POWER_STATE_CONFIG1 0x00 +#define QPHY_V7_PCS_USB3_AUTONOMOUS_MODE_CTRL 0x08 +#define QPHY_V7_PCS_USB3_LFPS_RXTERM_IRQ_CLEAR 0x14 +#define QPHY_V7_PCS_USB3_LFPS_DET_HIGH_COUNT_VAL 0x18 +#define QPHY_V7_PCS_USB3_RXEQTRAINING_DFE_TIME_S2 0x3c +#define QPHY_V7_PCS_USB3_RCVR_DTCT_DLY_U3_L 0x40 +#define QPHY_V7_PCS_USB3_RCVR_DTCT_DLY_U3_H 0x44 + +#endif -- GitLab From bc546cc85c1d92d9ba7b278b77016b7d4334fafa Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 7 Dec 2023 14:19:14 +0200 Subject: [PATCH 0466/1290] phy: qcom-qmp: qserdes-com: Add v7 register offsets The X1E80100 platform bumps the HW version of QMP phy to v7 for USB and PCIE g3x2. Add the new qserdes com offsets in a dedicated header file. Signed-off-by: Abel Vesa Link: https://lore.kernel.org/r/20231122-phy-qualcomm-v6-v6-20-v7-new-offsets-v3-5-dfd1c375ef61@linaro.org Signed-off-by: Vinod Koul --- .../qualcomm/phy-qcom-qmp-qserdes-com-v7.h | 87 +++++++++++++++++++ drivers/phy/qualcomm/phy-qcom-qmp.h | 2 + 2 files changed, 89 insertions(+) create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-qserdes-com-v7.h diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-com-v7.h b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-com-v7.h new file mode 100644 index 0000000000000..7430f49214779 --- /dev/null +++ b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-com-v7.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023, Linaro Limited + */ + +#ifndef QCOM_PHY_QMP_QSERDES_COM_V7_H_ +#define QCOM_PHY_QMP_QSERDES_COM_V7_H_ + +/* Only for QMP V7 PHY - QSERDES COM registers */ + +#define QSERDES_V7_COM_SSC_STEP_SIZE1_MODE1 0x00 +#define QSERDES_V7_COM_SSC_STEP_SIZE2_MODE1 0x04 +#define QSERDES_V7_COM_CP_CTRL_MODE1 0x10 +#define QSERDES_V7_COM_PLL_RCTRL_MODE1 0x14 +#define QSERDES_V7_COM_PLL_CCTRL_MODE1 0x18 +#define QSERDES_V7_COM_CORECLK_DIV_MODE1 0x1c +#define QSERDES_V7_COM_LOCK_CMP1_MODE1 0x20 +#define QSERDES_V7_COM_LOCK_CMP2_MODE1 0x24 +#define QSERDES_V7_COM_DEC_START_MODE1 0x28 +#define QSERDES_V7_COM_DEC_START_MSB_MODE1 0x2c +#define QSERDES_V7_COM_DIV_FRAC_START1_MODE1 0x30 +#define QSERDES_V7_COM_DIV_FRAC_START2_MODE1 0x34 +#define QSERDES_V7_COM_DIV_FRAC_START3_MODE1 0x38 +#define QSERDES_V7_COM_HSCLK_SEL_1 0x3c +#define QSERDES_V7_COM_INTEGLOOP_GAIN0_MODE1 0x40 +#define QSERDES_V7_COM_INTEGLOOP_GAIN1_MODE1 0x44 +#define QSERDES_V7_COM_VCO_TUNE1_MODE1 0x48 +#define QSERDES_V7_COM_VCO_TUNE2_MODE1 0x4c +#define QSERDES_V7_COM_BIN_VCOCAL_CMP_CODE1_MODE1 0x50 +#define QSERDES_V7_COM_BIN_VCOCAL_CMP_CODE2_MODE1 0x54 +#define QSERDES_V7_COM_BIN_VCOCAL_CMP_CODE1_MODE0 0x58 +#define QSERDES_V7_COM_BIN_VCOCAL_CMP_CODE2_MODE0 0x5c +#define QSERDES_V7_COM_SSC_STEP_SIZE1_MODE0 0x60 +#define QSERDES_V7_COM_SSC_STEP_SIZE2_MODE0 0x64 +#define QSERDES_V7_COM_CP_CTRL_MODE0 0x70 +#define QSERDES_V7_COM_PLL_RCTRL_MODE0 0x74 +#define QSERDES_V7_COM_PLL_CCTRL_MODE0 0x78 +#define QSERDES_V7_COM_PLL_CORE_CLK_DIV_MODE0 0x7c +#define QSERDES_V7_COM_LOCK_CMP1_MODE0 0x80 +#define QSERDES_V7_COM_LOCK_CMP2_MODE0 0x84 +#define QSERDES_V7_COM_DEC_START_MODE0 0x88 +#define QSERDES_V7_COM_DEC_START_MSB_MODE0 0x8c +#define QSERDES_V7_COM_DIV_FRAC_START1_MODE0 0x90 +#define QSERDES_V7_COM_DIV_FRAC_START2_MODE0 0x94 +#define QSERDES_V7_COM_DIV_FRAC_START3_MODE0 0x98 +#define QSERDES_V7_COM_HSCLK_HS_SWITCH_SEL_1 0x9c +#define QSERDES_V7_COM_INTEGLOOP_GAIN0_MODE0 0xa0 +#define QSERDES_V7_COM_INTEGLOOP_GAIN1_MODE0 0xa4 +#define QSERDES_V7_COM_VCO_TUNE1_MODE0 0xa8 +#define QSERDES_V7_COM_VCO_TUNE2_MODE0 0xac +#define QSERDES_V7_COM_BG_TIMER 0xbc +#define QSERDES_V7_COM_SSC_EN_CENTER 0xc0 +#define QSERDES_V7_COM_SSC_ADJ_PER1 0xc4 +#define QSERDES_V7_COM_SSC_PER1 0xcc +#define QSERDES_V7_COM_SSC_PER2 0xd0 +#define QSERDES_V7_COM_PLL_POST_DIV_MUX 0xd8 +#define QSERDES_V7_COM_PLL_BIAS_EN_CLK_BUFLR_EN 0xdc +#define QSERDES_V7_COM_CLK_ENABLE1 0xe0 +#define QSERDES_V7_COM_SYS_CLK_CTRL 0xe4 +#define QSERDES_V7_COM_SYSCLK_BUF_ENABLE 0xe8 +#define QSERDES_V7_COM_PLL_IVCO 0xf4 +#define QSERDES_V7_COM_PLL_IVCO_MODE1 0xf8 +#define QSERDES_V7_COM_SYSCLK_EN_SEL 0x110 +#define QSERDES_V7_COM_RESETSM_CNTRL 0x118 +#define QSERDES_V7_COM_LOCK_CMP_EN 0x120 +#define QSERDES_V7_COM_LOCK_CMP_CFG 0x124 +#define QSERDES_V7_COM_VCO_TUNE_CTRL 0x13c +#define QSERDES_V7_COM_VCO_TUNE_MAP 0x140 +#define QSERDES_V7_COM_VCO_TUNE_INITVAL2 0x148 +#define QSERDES_V7_COM_VCO_TUNE_MAXVAL2 0x158 +#define QSERDES_V7_COM_CLK_SELECT 0x164 +#define QSERDES_V7_COM_CORE_CLK_EN 0x170 +#define QSERDES_V7_COM_CMN_CONFIG_1 0x174 +#define QSERDES_V7_COM_SVS_MODE_CLK_SEL 0x17c +#define QSERDES_V7_COM_CMN_MISC_1 0x184 +#define QSERDES_V7_COM_CMN_MODE 0x188 +#define QSERDES_V7_COM_PLL_VCO_DC_LEVEL_CTRL 0x198 +#define QSERDES_V7_COM_AUTO_GAIN_ADJ_CTRL_1 0x1a4 +#define QSERDES_V7_COM_AUTO_GAIN_ADJ_CTRL_2 0x1a8 +#define QSERDES_V7_COM_AUTO_GAIN_ADJ_CTRL_3 0x1ac +#define QSERDES_V7_COM_ADDITIONAL_MISC 0x1b4 +#define QSERDES_V7_COM_ADDITIONAL_MISC_2 0x1b8 +#define QSERDES_V7_COM_ADDITIONAL_MISC_3 0x1bc +#define QSERDES_V7_COM_CMN_STATUS 0x1d0 +#define QSERDES_V7_COM_C_READY_STATUS 0x1f8 + +#endif diff --git a/drivers/phy/qualcomm/phy-qcom-qmp.h b/drivers/phy/qualcomm/phy-qcom-qmp.h index 21f6a56e7ae37..3a0512c3e07ab 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp.h @@ -26,6 +26,8 @@ #include "phy-qcom-qmp-qserdes-txrx-v6_20.h" #include "phy-qcom-qmp-qserdes-ln-shrd-v6.h" +#include "phy-qcom-qmp-qserdes-com-v7.h" + #include "phy-qcom-qmp-qserdes-pll.h" #include "phy-qcom-qmp-pcs-v2.h" -- GitLab From 762c3565f3c8105603089eeaa0501e5089922221 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 7 Dec 2023 14:19:15 +0200 Subject: [PATCH 0467/1290] phy: qcom-qmp: qserdes-txrx: Add V6 N4 register offsets There is a variant of V6 offsets that are different, the QMP PHY N4, and it is found on the X1E80100 platform. Reviewed-by: Dmitry Baryshkov Signed-off-by: Abel Vesa Link: https://lore.kernel.org/r/20231122-phy-qualcomm-v6-v6-20-v7-new-offsets-v3-6-dfd1c375ef61@linaro.org Signed-off-by: Vinod Koul --- .../phy-qcom-qmp-qserdes-txrx-v6_n4.h | 51 +++++++++++++++++++ drivers/phy/qualcomm/phy-qcom-qmp.h | 1 + 2 files changed, 52 insertions(+) create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h new file mode 100644 index 0000000000000..a814ad11af071 --- /dev/null +++ b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v6_n4.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023, Linaro Limited + */ + +#ifndef QCOM_PHY_QMP_QSERDES_TXRX_V6_N4_H_ +#define QCOM_PHY_QMP_QSERDES_TXRX_V6_N4_H_ + +#define QSERDES_V6_N4_TX_RES_CODE_LANE_OFFSET_TX 0x30 +#define QSERDES_V6_N4_TX_RES_CODE_LANE_OFFSET_RX 0x34 +#define QSERDES_V6_N4_TX_LANE_MODE_1 0x78 +#define QSERDES_V6_N4_TX_LANE_MODE_2 0x7c +#define QSERDES_V6_N4_TX_LANE_MODE_3 0x80 + +#define QSERDES_V6_N4_RX_UCDR_FO_GAIN_RATE2 0x8 +#define QSERDES_V6_N4_RX_UCDR_SO_GAIN_RATE2 0x18 +#define QSERDES_V6_N4_RX_UCDR_PI_CONTROLS 0x20 +#define QSERDES_V6_N4_RX_IVCM_CAL_CODE_OVERRIDE 0x94 +#define QSERDES_V6_N4_RX_RX_IVCM_CAL_CTRL2 0x9c +#define QSERDES_V6_N4_RX_RX_IVCM_POSTCAL_OFFSET 0xa0 +#define QSERDES_V6_N4_RX_DFE_3 0xb4 +#define QSERDES_V6_N4_RX_VGA_CAL_CNTRL1 0xe0 +#define QSERDES_V6_N4_RX_VGA_CAL_MAN_VAL 0xe8 +#define QSERDES_V6_N4_RX_GM_CAL 0x10c +#define QSERDES_V6_N4_RX_SIGDET_ENABLES 0x148 +#define QSERDES_V6_N4_RX_SIGDET_CNTRL 0x14c +#define QSERDES_V6_N4_RX_SIGDET_DEGLITCH_CNTRL 0x154 +#define QSERDES_V6_N4_RX_DFE_CTLE_POST_CAL_OFFSET 0x194 +#define QSERDES_V6_N4_RX_Q_PI_INTRINSIC_BIAS_RATE32 0x1dc +#define QSERDES_V6_N4_RX_UCDR_PI_CTRL1 0x23c +#define QSERDES_V6_N4_RX_UCDR_PI_CTRL2 0x240 +#define QSERDES_V6_N4_RX_UCDR_SB2_GAIN2_RATE2 0x27c +#define QSERDES_V6_N4_RX_DFE_DAC_ENABLE1 0x298 +#define QSERDES_V6_N4_RX_MODE_RATE_0_1_B0 0x2b8 +#define QSERDES_V6_N4_RX_MODE_RATE_0_1_B1 0x2bc +#define QSERDES_V6_N4_RX_MODE_RATE_0_1_B2 0x2c0 +#define QSERDES_V6_N4_RX_MODE_RATE_0_1_B3 0x2c4 +#define QSERDES_V6_N4_RX_MODE_RATE_0_1_B4 0x2c8 +#define QSERDES_V6_N4_RX_MODE_RATE_0_1_B5 0x2cc +#define QSERDES_V6_N4_RX_MODE_RATE_0_1_B6 0x2d0 +#define QSERDES_V6_N4_RX_MODE_RATE2_B0 0x2d4 +#define QSERDES_V6_N4_RX_MODE_RATE2_B1 0x2d8 +#define QSERDES_V6_N4_RX_MODE_RATE2_B2 0x2dc +#define QSERDES_V6_N4_RX_MODE_RATE2_B3 0x2e0 +#define QSERDES_V6_N4_RX_MODE_RATE2_B4 0x2e4 +#define QSERDES_V6_N4_RX_MODE_RATE2_B5 0x2e8 +#define QSERDES_V6_N4_RX_MODE_RATE2_B6 0x2ec +#define QSERDES_V6_N4_RX_RX_SUMMER_CAL_SPD_MODE 0x30c +#define QSERDES_V6_N4_RX_RX_BKUP_CTRL1 0x310 + +#endif diff --git a/drivers/phy/qualcomm/phy-qcom-qmp.h b/drivers/phy/qualcomm/phy-qcom-qmp.h index 3a0512c3e07ab..63b3cbfcb50f2 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp.h @@ -24,6 +24,7 @@ #include "phy-qcom-qmp-qserdes-com-v6.h" #include "phy-qcom-qmp-qserdes-txrx-v6.h" #include "phy-qcom-qmp-qserdes-txrx-v6_20.h" +#include "phy-qcom-qmp-qserdes-txrx-v6_n4.h" #include "phy-qcom-qmp-qserdes-ln-shrd-v6.h" #include "phy-qcom-qmp-qserdes-com-v7.h" -- GitLab From ee6fcc0f337d6790b46838bab76c36e8bdd5658e Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 7 Dec 2023 14:19:16 +0200 Subject: [PATCH 0468/1290] phy: qcom-qmp: qserdes-txrx: Add v7 register offsets The X1E80100 platform bumps the HW version of QMP phy to v7 for USB and PCIE. Add the new qserdes TX RX offsets in a dedicated header file. Reviewed-by: Dmitry Baryshkov Signed-off-by: Abel Vesa Link: https://lore.kernel.org/r/20231122-phy-qualcomm-v6-v6-20-v7-new-offsets-v3-7-dfd1c375ef61@linaro.org Signed-off-by: Vinod Koul --- .../qualcomm/phy-qcom-qmp-qserdes-txrx-v7.h | 78 +++++++++++++++++++ drivers/phy/qualcomm/phy-qcom-qmp.h | 1 + 2 files changed, 79 insertions(+) create mode 100644 drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v7.h diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v7.h b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v7.h new file mode 100644 index 0000000000000..91f865b11347a --- /dev/null +++ b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-v7.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023, Linaro Limited + */ + +#ifndef QCOM_PHY_QMP_QSERDES_TXRX_V7_H_ +#define QCOM_PHY_QMP_QSERDES_TXRX_V7_H_ + +#define QSERDES_V7_TX_CLKBUF_ENABLE 0x08 +#define QSERDES_V7_TX_RESET_TSYNC_EN 0x1c +#define QSERDES_V7_TX_PRE_STALL_LDO_BOOST_EN 0x20 +#define QSERDES_V7_TX_TX_BAND 0x24 +#define QSERDES_V7_TX_INTERFACE_SELECT 0x2c +#define QSERDES_V7_TX_RES_CODE_LANE_TX 0x34 +#define QSERDES_V7_TX_RES_CODE_LANE_RX 0x38 +#define QSERDES_V7_TX_RES_CODE_LANE_OFFSET_TX 0x3c +#define QSERDES_V7_TX_RES_CODE_LANE_OFFSET_RX 0x40 +#define QSERDES_V7_TX_PARRATE_REC_DETECT_IDLE_EN 0x60 +#define QSERDES_V7_TX_BIST_PATTERN7 0x7c +#define QSERDES_V7_TX_LANE_MODE_1 0x84 +#define QSERDES_V7_TX_LANE_MODE_2 0x88 +#define QSERDES_V7_TX_LANE_MODE_3 0x8c +#define QSERDES_V7_TX_LANE_MODE_4 0x90 +#define QSERDES_V7_TX_LANE_MODE_5 0x94 +#define QSERDES_V7_TX_RCV_DETECT_LVL_2 0xa4 +#define QSERDES_V7_TX_TRAN_DRVR_EMP_EN 0xc0 +#define QSERDES_V7_TX_TX_INTERFACE_MODE 0xc4 +#define QSERDES_V7_TX_VMODE_CTRL1 0xc8 +#define QSERDES_V7_TX_PI_QEC_CTRL 0xe4 + +#define QSERDES_V7_RX_UCDR_FO_GAIN 0x08 +#define QSERDES_V7_RX_UCDR_SO_GAIN 0x14 +#define QSERDES_V7_RX_UCDR_FASTLOCK_FO_GAIN 0x30 +#define QSERDES_V7_RX_UCDR_SO_SATURATION_AND_ENABLE 0x34 +#define QSERDES_V7_RX_UCDR_FASTLOCK_COUNT_LOW 0x3c +#define QSERDES_V7_RX_UCDR_FASTLOCK_COUNT_HIGH 0x40 +#define QSERDES_V7_RX_UCDR_PI_CONTROLS 0x44 +#define QSERDES_V7_RX_UCDR_SB2_THRESH1 0x4c +#define QSERDES_V7_RX_UCDR_SB2_THRESH2 0x50 +#define QSERDES_V7_RX_UCDR_SB2_GAIN1 0x54 +#define QSERDES_V7_RX_UCDR_SB2_GAIN2 0x58 +#define QSERDES_V7_RX_AUX_DATA_TCOARSE_TFINE 0x60 +#define QSERDES_V7_RX_TX_ADAPT_POST_THRESH 0xcc +#define QSERDES_V7_RX_VGA_CAL_CNTRL1 0xd4 +#define QSERDES_V7_RX_VGA_CAL_CNTRL2 0xd8 +#define QSERDES_V7_RX_GM_CAL 0xdc +#define QSERDES_V7_RX_RX_EQU_ADAPTOR_CNTRL2 0xec +#define QSERDES_V7_RX_RX_EQU_ADAPTOR_CNTRL3 0xf0 +#define QSERDES_V7_RX_RX_EQU_ADAPTOR_CNTRL4 0xf4 +#define QSERDES_V7_RX_RX_IDAC_TSETTLE_LOW 0xf8 +#define QSERDES_V7_RX_RX_IDAC_TSETTLE_HIGH 0xfc +#define QSERDES_V7_RX_RX_EQ_OFFSET_ADAPTOR_CNTRL1 0x110 +#define QSERDES_V7_RX_SIDGET_ENABLES 0x118 +#define QSERDES_V7_RX_SIGDET_CNTRL 0x11c +#define QSERDES_V7_RX_SIGDET_DEGLITCH_CNTRL 0x124 +#define QSERDES_V7_RX_RX_MODE_00_LOW 0x15c +#define QSERDES_V7_RX_RX_MODE_00_HIGH 0x160 +#define QSERDES_V7_RX_RX_MODE_00_HIGH2 0x164 +#define QSERDES_V7_RX_RX_MODE_00_HIGH3 0x168 +#define QSERDES_V7_RX_RX_MODE_00_HIGH4 0x16c +#define QSERDES_V7_RX_RX_MODE_01_LOW 0x170 +#define QSERDES_V7_RX_RX_MODE_01_HIGH 0x174 +#define QSERDES_V7_RX_RX_MODE_01_HIGH2 0x178 +#define QSERDES_V7_RX_RX_MODE_01_HIGH3 0x17c +#define QSERDES_V7_RX_RX_MODE_01_HIGH4 0x180 +#define QSERDES_V7_RX_RX_MODE_10_LOW 0x184 +#define QSERDES_V7_RX_RX_MODE_10_HIGH 0x188 +#define QSERDES_V7_RX_RX_MODE_10_HIGH2 0x18c +#define QSERDES_V7_RX_RX_MODE_10_HIGH3 0x190 +#define QSERDES_V7_RX_RX_MODE_10_HIGH4 0x194 +#define QSERDES_V7_RX_DFE_EN_TIMER 0x1a0 +#define QSERDES_V7_RX_DFE_CTLE_POST_CAL_OFFSET 0x1a4 +#define QSERDES_V7_RX_DCC_CTRL1 0x1a8 +#define QSERDES_V7_RX_VTH_CODE 0x1b0 +#define QSERDES_V7_RX_SIGDET_CAL_CTRL1 0x1e4 +#define QSERDES_V7_RX_SIGDET_CAL_TRIM 0x1f8 + +#endif diff --git a/drivers/phy/qualcomm/phy-qcom-qmp.h b/drivers/phy/qualcomm/phy-qcom-qmp.h index 63b3cbfcb50f2..6923496cbfee2 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp.h @@ -28,6 +28,7 @@ #include "phy-qcom-qmp-qserdes-ln-shrd-v6.h" #include "phy-qcom-qmp-qserdes-com-v7.h" +#include "phy-qcom-qmp-qserdes-txrx-v7.h" #include "phy-qcom-qmp-qserdes-pll.h" -- GitLab From 54c899f0d647e5724b02486243da40134dc91c45 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Dec 2023 15:49:17 -0800 Subject: [PATCH 0469/1290] phy: renesas: phy-rcar-gen2: use select for GENERIC_PHY Change the last "depends on GENERIC_PHY" to use select, like the other 170+ Kconfig users do. This can help prevent circular dependency issues. Signed-off-by: Randy Dunlap Cc: Vinod Koul Cc: Kishon Vijay Abraham I Cc: linux-phy@lists.infradead.org Link: https://lore.kernel.org/r/20231204234917.23509-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- drivers/phy/renesas/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/renesas/Kconfig b/drivers/phy/renesas/Kconfig index 36505fc5f386e..e342eef0640b7 100644 --- a/drivers/phy/renesas/Kconfig +++ b/drivers/phy/renesas/Kconfig @@ -13,7 +13,7 @@ config PHY_R8A779F0_ETHERNET_SERDES config PHY_RCAR_GEN2 tristate "Renesas R-Car generation 2 USB PHY driver" depends on ARCH_RENESAS - depends on GENERIC_PHY + select GENERIC_PHY help Support for USB PHY found on Renesas R-Car generation 2 SoCs. -- GitLab From 21a1d02579ae75fd45555b84d20ba55632a14a19 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 18 Dec 2023 14:05:53 +0100 Subject: [PATCH 0470/1290] dt-bindings: phy: qcom,sc8280xp-qmp-usb43dp-phy: fix path to header Fix the path to bindings header in description. Fixes: e1c4c5436b4a ("dt-bindings: phy: qcom,qmp-usb3-dp: fix sc8280xp binding") Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Kozlowski Acked-by: Rob Herring Reviewed-by: Johan Hovold Link: https://lore.kernel.org/r/20231218130553.45893-1-krzysztof.kozlowski@linaro.org Signed-off-by: Vinod Koul --- .../bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml index ae83cb8cb21f8..59d34a2a21962 100644 --- a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml +++ b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml @@ -63,12 +63,12 @@ properties: "#clock-cells": const: 1 description: - See include/dt-bindings/dt-bindings/phy/phy-qcom-qmp.h + See include/dt-bindings/phy/phy-qcom-qmp.h "#phy-cells": const: 1 description: - See include/dt-bindings/dt-bindings/phy/phy-qcom-qmp.h + See include/dt-bindings/phy/phy-qcom-qmp.h orientation-switch: description: -- GitLab From 5301b7a04040b0a6191856c765146e0a9ab88ebc Mon Sep 17 00:00:00 2001 From: Can Guo Date: Sat, 2 Dec 2023 04:36:15 -0800 Subject: [PATCH 0471/1290] phy: qualcomm: phy-qcom-qmp-ufs: Rectify SM8550 UFS HS-G4 PHY Settings The registers, which are being touched in current SM8550 UFS PHY settings, and the values being programmed are mainly the ones working for HS-G4 mode, meanwhile, there are also a few ones somehow taken from HS-G5 PHY settings. However, even consider HS-G4 mode only, some of them are incorrect and some are missing. Rectify the HS-G4 PHY settings by strictly aligning with the SM8550 UFS PHY Hardware Programming Guide suggested HS-G4 PHY settings. Fixes: 1679bfef906f ("phy: qcom-qmp-ufs: Add SM8550 support") Reviewed-by: Dmitry Baryshkov Reviewed-by: Abel Vesa Reviewed-by: Manivannan Sadhasivam Signed-off-by: Can Guo Tested-by: Neil Armstrong # on SM8550-QRD Link: https://lore.kernel.org/r/1701520577-31163-10-git-send-email-quic_cang@quicinc.com Signed-off-by: Vinod Koul --- .../phy-qcom-qmp-qserdes-txrx-ufs-v6.h | 1 + drivers/phy/qualcomm/phy-qcom-qmp-ufs.c | 28 +++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-ufs-v6.h b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-ufs-v6.h index ae220fd04d10d..35d497fd9f9a4 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-ufs-v6.h +++ b/drivers/phy/qualcomm/phy-qcom-qmp-qserdes-txrx-ufs-v6.h @@ -11,6 +11,7 @@ #define QSERDES_UFS_V6_TX_RES_CODE_LANE_OFFSET_TX 0x30 #define QSERDES_UFS_V6_TX_RES_CODE_LANE_OFFSET_RX 0x34 #define QSERDES_UFS_V6_TX_LANE_MODE_1 0x7c +#define QSERDES_UFS_V6_TX_FR_DCC_CTRL 0x108 #define QSERDES_UFS_V6_RX_UCDR_FASTLOCK_FO_GAIN_RATE2 0x08 #define QSERDES_UFS_V6_RX_UCDR_FASTLOCK_FO_GAIN_RATE4 0x10 diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c index 5f79d188b435e..3c2e6255e26f6 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c @@ -763,22 +763,26 @@ static const struct qmp_phy_init_tbl sm8550_ufsphy_serdes[] = { QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE0, 0x14), QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x7f), QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x06), - QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x4c), - QMP_PHY_INIT_CFG(QSERDES_V6_COM_CP_CTRL_MODE0, 0x0a), - QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_RCTRL_MODE0, 0x18), - QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE0, 0x14), - QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x99), - QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x07), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE1, 0x4c), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CP_CTRL_MODE1, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_RCTRL_MODE1, 0x18), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE1, 0x14), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE1, 0x99), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE1, 0x07), +}; + +static const struct qmp_phy_init_tbl sm8550_ufsphy_hs_b_serdes[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_MAP, 0x44), }; static const struct qmp_phy_init_tbl sm8550_ufsphy_tx[] = { - QMP_PHY_INIT_CFG(QSERDES_V6_TX_LANE_MODE_1, 0x05), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_TX_LANE_MODE_1, 0x05), QMP_PHY_INIT_CFG(QSERDES_UFS_V6_TX_RES_CODE_LANE_OFFSET_TX, 0x07), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_TX_FR_DCC_CTRL, 0x4c), }; static const struct qmp_phy_init_tbl sm8550_ufsphy_rx[] = { - QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_UCDR_FASTLOCK_FO_GAIN_RATE2, 0x0c), - QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_UCDR_FASTLOCK_FO_GAIN_RATE4, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_UCDR_FO_GAIN_RATE2, 0x0c), QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_VGA_CAL_MAN_VAL, 0x0e), QMP_PHY_INIT_CFG(QSERDES_UFS_V6_RX_MODE_RATE_0_1_B0, 0xc2), @@ -801,6 +805,8 @@ static const struct qmp_phy_init_tbl sm8550_ufsphy_pcs[] = { QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_MID_TERM_CTRL1, 0x43), QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_PLL_CNTL, 0x2b), QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_MULTI_LANE_CTRL1, 0x02), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_TX_HSGEAR_CAPABILITY, 0x04), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_UFS_RX_HSGEAR_CAPABILITY, 0x04), }; static const struct qmp_phy_init_tbl sm8650_ufsphy_serdes[] = { @@ -1357,6 +1363,10 @@ static const struct qmp_phy_cfg sm8550_ufsphy_cfg = { .pcs = sm8550_ufsphy_pcs, .pcs_num = ARRAY_SIZE(sm8550_ufsphy_pcs), }, + .tbls_hs_b = { + .serdes = sm8550_ufsphy_hs_b_serdes, + .serdes_num = ARRAY_SIZE(sm8550_ufsphy_hs_b_serdes), + }, .clk_list = sdm845_ufs_phy_clk_l, .num_clks = ARRAY_SIZE(sdm845_ufs_phy_clk_l), .vreg_list = qmp_phy_vreg_l, -- GitLab From 57f31e911eaa5e682c0a03253f8b4348adee52cb Mon Sep 17 00:00:00 2001 From: Wang Jinchao Date: Fri, 15 Dec 2023 14:09:00 +0800 Subject: [PATCH 0472/1290] phy: phy-can-transceiver: insert space after include Maintain Consistent Formatting: Insert Space after #include Signed-off-by: Wang Jinchao Reviewed-by: Marc Kleine-Budde Link: https://lore.kernel.org/r/202312151407+0800-wangjinchao@xfusion.com Signed-off-by: Vinod Koul --- drivers/phy/phy-can-transceiver.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/phy/phy-can-transceiver.c b/drivers/phy/phy-can-transceiver.c index 840b7f8a31c5f..ee4ce42496985 100644 --- a/drivers/phy/phy-can-transceiver.c +++ b/drivers/phy/phy-can-transceiver.c @@ -6,11 +6,11 @@ * */ #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include struct can_transceiver_data { -- GitLab From cc230a4cd8e91f64c90b5494dfd76848197418ed Mon Sep 17 00:00:00 2001 From: Chunfeng Yun Date: Mon, 11 Dec 2023 10:56:23 +0800 Subject: [PATCH 0473/1290] dt-bindings: phy: mediatek: tphy: add a property for force-mode switch Due to some old SoCs with shared t-phy between usb3 and pcie only support force-mode switch, and shared and non-shared t-phy may exist at the same time on a SoC, can't use compatible to distinguish between shared and non-shared t-phy, add a property to supported it. Currently, only support switch from default pcie mode to usb3 mode. But now prefer to use "mediatek,syscon-type" on new SoC as far as possible. Signed-off-by: Chunfeng Yun Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231211025624.28991-1-chunfeng.yun@mediatek.com Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/phy/mediatek,tphy.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/devicetree/bindings/phy/mediatek,tphy.yaml b/Documentation/devicetree/bindings/phy/mediatek,tphy.yaml index 2bb91542e984e..acba0720125dd 100644 --- a/Documentation/devicetree/bindings/phy/mediatek,tphy.yaml +++ b/Documentation/devicetree/bindings/phy/mediatek,tphy.yaml @@ -235,6 +235,15 @@ patternProperties: Specify the flag to enable BC1.2 if support it type: boolean + mediatek,force-mode: + description: + The force mode is used to manually switch the shared phy mode between + USB3 and PCIe, when USB3 phy type is selected by the consumer, and + force-mode is set, will cause phy's power and pipe toggled and force + phy as USB3 mode which switched from default PCIe mode. But perfer to + use the property "mediatek,syscon-type" for newer SoCs that support it. + type: boolean + mediatek,syscon-type: $ref: /schemas/types.yaml#/definitions/phandle-array maxItems: 1 -- GitLab From 9b27303003f5af0d378f29ccccea57c7d65cc642 Mon Sep 17 00:00:00 2001 From: Chunfeng Yun Date: Mon, 11 Dec 2023 10:56:24 +0800 Subject: [PATCH 0474/1290] phy: mediatek: tphy: add support force phy mode switch this is used to be compatible with old SoCs, such as mt8195, which shares t-phy between usb3 and pcie controller, usually, it's default mode is pcie rc mode, and could use force mode to switch into usb3 mode, because pericfg layer doesn't provide mode switch, also no efuse or jumper can be used; Currently, only support switch from default pcie mode to usb3; Note: don't use this way on new SoCs, use pericfg layer's mode switch instead (by perperty "mediatek,syscon-type"). Signed-off-by: Chunfeng Yun Link: https://lore.kernel.org/r/20231211025624.28991-2-chunfeng.yun@mediatek.com Signed-off-by: Vinod Koul --- drivers/phy/mediatek/phy-mtk-tphy.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/phy/mediatek/phy-mtk-tphy.c b/drivers/phy/mediatek/phy-mtk-tphy.c index 05eab9014132f..a4746f6cb8a18 100644 --- a/drivers/phy/mediatek/phy-mtk-tphy.c +++ b/drivers/phy/mediatek/phy-mtk-tphy.c @@ -185,6 +185,10 @@ #define P3D_RG_CDR_BIR_LTD1 GENMASK(28, 24) #define P3D_RG_CDR_BIR_LTD0 GENMASK(12, 8) +#define U3P_U3_PHYD_TOP1 0x100 +#define P3D_RG_PHY_MODE GENMASK(2, 1) +#define P3D_RG_FORCE_PHY_MODE BIT(0) + #define U3P_U3_PHYD_RXDET1 0x128 #define P3D_RG_RXDET_STB2_SET GENMASK(17, 9) @@ -327,6 +331,7 @@ struct mtk_phy_instance { int discth; int pre_emphasis; bool bc12_en; + bool type_force_mode; }; struct mtk_tphy { @@ -768,6 +773,23 @@ static void u3_phy_instance_init(struct mtk_tphy *tphy, void __iomem *phya = u3_banks->phya; void __iomem *phyd = u3_banks->phyd; + if (instance->type_force_mode) { + /* force phy as usb mode, default is pcie rc mode */ + mtk_phy_update_field(phyd + U3P_U3_PHYD_TOP1, P3D_RG_PHY_MODE, 1); + mtk_phy_set_bits(phyd + U3P_U3_PHYD_TOP1, P3D_RG_FORCE_PHY_MODE); + /* power down phy by ip and pipe reset */ + mtk_phy_set_bits(u3_banks->chip + U3P_U3_CHIP_GPIO_CTLD, + P3C_FORCE_IP_SW_RST | P3C_MCU_BUS_CK_GATE_EN); + mtk_phy_set_bits(u3_banks->chip + U3P_U3_CHIP_GPIO_CTLE, + P3C_RG_SWRST_U3_PHYD | P3C_RG_SWRST_U3_PHYD_FORCE_EN); + udelay(10); + /* power on phy again */ + mtk_phy_clear_bits(u3_banks->chip + U3P_U3_CHIP_GPIO_CTLD, + P3C_FORCE_IP_SW_RST | P3C_MCU_BUS_CK_GATE_EN); + mtk_phy_clear_bits(u3_banks->chip + U3P_U3_CHIP_GPIO_CTLE, + P3C_RG_SWRST_U3_PHYD | P3C_RG_SWRST_U3_PHYD_FORCE_EN); + } + /* gating PCIe Analog XTAL clock */ mtk_phy_set_bits(u3_banks->spllc + U3P_SPLLC_XTALCTL3, XC3_RG_U3_XTAL_RX_PWD | XC3_RG_U3_FRC_XTAL_RX_PWD); @@ -1120,6 +1142,9 @@ static void phy_parse_property(struct mtk_tphy *tphy, { struct device *dev = &instance->phy->dev; + if (instance->type == PHY_TYPE_USB3) + instance->type_force_mode = device_property_read_bool(dev, "mediatek,force-mode"); + if (instance->type != PHY_TYPE_USB2) return; -- GitLab From ec80c175c096eb752d581ef0aafb12ed46010b2a Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 7 Dec 2023 14:49:10 +0200 Subject: [PATCH 0475/1290] dt-bindings: phy: qcom: snps-eusb2: Document the X1E80100 compatible Add the X1E80100 compatible to the list of supported PHYs. Signed-off-by: Abel Vesa Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231122-phy-qualcomm-eusb2-x1e80100-v2-1-3ba9a8e5ade4@linaro.org Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/phy/qcom,snps-eusb2-phy.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/phy/qcom,snps-eusb2-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,snps-eusb2-phy.yaml index 8f5d7362046c9..b82f7f5731ed4 100644 --- a/Documentation/devicetree/bindings/phy/qcom,snps-eusb2-phy.yaml +++ b/Documentation/devicetree/bindings/phy/qcom,snps-eusb2-phy.yaml @@ -19,6 +19,7 @@ properties: - enum: - qcom,sdx75-snps-eusb2-phy - qcom,sm8650-snps-eusb2-phy + - qcom,x1e80100-snps-eusb2-phy - const: qcom,sm8550-snps-eusb2-phy - const: qcom,sm8550-snps-eusb2-phy -- GitLab From f11aeb9d49632afc7abd9dfea6bcf5b3dd8addc1 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 7 Dec 2023 15:16:41 +0200 Subject: [PATCH 0476/1290] dt-bindings: phy: qcom,sc8280xp-qmp-usb43dp-phy: Document X1E80100 compatible Add the X1E80100 compatible to the list. Signed-off-by: Abel Vesa Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231201-x1e80100-phy-combo-v1-1-6938ec41f3ac@linaro.org Signed-off-by: Vinod Koul --- .../devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml index 59d34a2a21962..2d0d7e9e64311 100644 --- a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml +++ b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml @@ -28,6 +28,7 @@ properties: - qcom,sm8450-qmp-usb3-dp-phy - qcom,sm8550-qmp-usb3-dp-phy - qcom,sm8650-qmp-usb3-dp-phy + - qcom,x1e80100-qmp-usb3-dp-phy reg: maxItems: 1 @@ -130,6 +131,7 @@ allOf: - qcom,sm6350-qmp-usb3-dp-phy - qcom,sm8550-qmp-usb3-dp-phy - qcom,sm8650-qmp-usb3-dp-phy + - qcom,x1e80100-qmp-usb3-dp-phy then: required: - power-domains -- GitLab From d7b3579f84f74e0f7d88d180d4e15c679786b648 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 7 Dec 2023 15:16:42 +0200 Subject: [PATCH 0477/1290] phy: qcom-qmp-combo: Add x1e80100 USB/DP combo phys The X1E80100 has three copies of an USB/DP compbo PHY, add support for this to the Qualcomm QMP PHY driver. Signed-off-by: Abel Vesa Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20231201-x1e80100-phy-combo-v1-2-6938ec41f3ac@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 170 ++++++++++++++++++++++ 1 file changed, 170 insertions(+) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c index 0417856b8e7bc..cd88f7b51088e 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c @@ -1203,6 +1203,127 @@ static const struct qmp_phy_init_tbl sc8280xp_usb43dp_pcs_tbl[] = { QMP_PHY_INIT_CFG(QPHY_V5_PCS_USB3_RXEQTRAINING_DFE_TIME_S2, 0x07), }; +static const struct qmp_phy_init_tbl x1e80100_usb43dp_serdes_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_EN_CENTER, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER1, 0x62), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_PER2, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE1_MODE0, 0xc2), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE2_MODE0, 0x03), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE1_MODE1, 0xc2), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SSC_STEP_SIZE2_MODE1, 0x03), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYSCLK_BUF_ENABLE, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CP_CTRL_MODE0, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CP_CTRL_MODE1, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_RCTRL_MODE0, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_RCTRL_MODE1, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE0, 0x36), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CCTRL_MODE1, 0x36), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SYSCLK_EN_SEL, 0x1a), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_EN, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP_CFG, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE0, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE0, 0x1a), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP1_MODE1, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_LOCK_CMP2_MODE1, 0x41), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE0, 0x82), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MSB_MODE0, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MODE1, 0x82), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DEC_START_MSB_MODE1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START1_MODE0, 0x55), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START2_MODE0, 0x55), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START3_MODE0, 0x03), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START1_MODE1, 0x55), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START2_MODE1, 0x55), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_DIV_FRAC_START3_MODE1, 0x03), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_MAP, 0x14), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE1_MODE0, 0xba), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE2_MODE0, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE1_MODE1, 0xba), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE2_MODE1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_SEL_1, 0x13), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_HSCLK_HS_SWITCH_SEL_1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_CORE_CLK_DIV_MODE0, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CORECLK_DIV_MODE1, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CORE_CLK_EN, 0xa0), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_CMN_CONFIG_1, 0x76), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_IVCO, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_PLL_IVCO_MODE1, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_INTEGLOOP_GAIN0_MODE0, 0x20), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_INTEGLOOP_GAIN0_MODE1, 0x20), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_INITVAL2, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_VCO_TUNE_MAXVAL2, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_SVS_MODE_CLK_SEL, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_COM_BG_TIMER, 0x0a), +}; + +static const struct qmp_phy_init_tbl x1e80100_usb43dp_tx_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_LANE_MODE_1, 0x05), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_LANE_MODE_2, 0x50), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_LANE_MODE_3, 0x50), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_RES_CODE_LANE_OFFSET_TX, 0x1f), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_TX_RES_CODE_LANE_OFFSET_RX, 0x0a), +}; + +static const struct qmp_phy_init_tbl x1e80100_usb43dp_rx_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_SIGDET_CNTRL, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_SIGDET_DEGLITCH_CNTRL, 0x0e), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_SIGDET_ENABLES, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE_0_1_B0, 0xc3), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE_0_1_B1, 0xc3), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE_0_1_B2, 0xd8), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE_0_1_B3, 0x9e), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE_0_1_B4, 0x36), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE_0_1_B5, 0xb6), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE_0_1_B6, 0x64), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE2_B0, 0xd6), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE2_B1, 0xee), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE2_B2, 0x18), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE2_B3, 0x9a), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE2_B4, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE2_B5, 0x36), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_MODE_RATE2_B6, 0xe3), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_IVCM_CAL_CODE_OVERRIDE, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_RX_IVCM_CAL_CTRL2, 0x80), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_RX_SUMMER_CAL_SPD_MODE, 0x2f), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_DFE_CTLE_POST_CAL_OFFSET, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_UCDR_PI_CONTROLS, 0x15), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_UCDR_PI_CTRL1, 0xd0), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_UCDR_PI_CTRL2, 0x48), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_UCDR_SB2_GAIN2_RATE2, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_RX_IVCM_POSTCAL_OFFSET, 0x7c), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_VGA_CAL_CNTRL1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_VGA_CAL_MAN_VAL, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_DFE_DAC_ENABLE1, 0x88), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_DFE_3, 0x45), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_GM_CAL, 0x0d), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_UCDR_FO_GAIN_RATE2, 0x09), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_UCDR_SO_GAIN_RATE2, 0x05), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_Q_PI_INTRINSIC_BIAS_RATE32, 0x2f), + QMP_PHY_INIT_CFG(QSERDES_V6_N4_RX_RX_BKUP_CTRL1, 0x14), +}; + +static const struct qmp_phy_init_tbl x1e80100_usb43dp_pcs_tbl[] = { + QMP_PHY_INIT_CFG(QPHY_V6_PCS_RCVR_DTCT_DLY_P1U2_L, 0xe7), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_RCVR_DTCT_DLY_P1U2_H, 0x03), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG1, 0xc4), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG2, 0x89), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG3, 0x20), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_LOCK_DETECT_CONFIG6, 0x13), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_REFGEN_REQ_CONFIG1, 0x21), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_RX_SIGDET_LVL, 0x55), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_CDR_RESET_TIME, 0x0a), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_ALIGN_DETECT_CONFIG1, 0xd4), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_ALIGN_DETECT_CONFIG2, 0x30), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_PCS_TX_RX_CONFIG, 0x0c), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_EQ_CONFIG1, 0x4b), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_EQ_CONFIG5, 0x10), +}; + +static const struct qmp_phy_init_tbl x1e80100_usb43dp_pcs_usb_tbl[] = { + QMP_PHY_INIT_CFG(QPHY_V6_PCS_USB3_LFPS_DET_HIGH_COUNT_VAL, 0xf8), + QMP_PHY_INIT_CFG(QPHY_V6_PCS_USB3_RXEQTRAINING_DFE_TIME_S2, 0x07), +}; + /* list of regulators */ struct qmp_regulator_data { const char *name; @@ -1684,6 +1805,51 @@ static const struct qmp_phy_cfg sc8280xp_usb43dpphy_cfg = { .regs = qmp_v5_5nm_usb3phy_regs_layout, }; +static const struct qmp_phy_cfg x1e80100_usb3dpphy_cfg = { + .offsets = &qmp_combo_offsets_v5, + + .serdes_tbl = x1e80100_usb43dp_serdes_tbl, + .serdes_tbl_num = ARRAY_SIZE(x1e80100_usb43dp_serdes_tbl), + .tx_tbl = x1e80100_usb43dp_tx_tbl, + .tx_tbl_num = ARRAY_SIZE(x1e80100_usb43dp_tx_tbl), + .rx_tbl = x1e80100_usb43dp_rx_tbl, + .rx_tbl_num = ARRAY_SIZE(x1e80100_usb43dp_rx_tbl), + .pcs_tbl = x1e80100_usb43dp_pcs_tbl, + .pcs_tbl_num = ARRAY_SIZE(x1e80100_usb43dp_pcs_tbl), + .pcs_usb_tbl = x1e80100_usb43dp_pcs_usb_tbl, + .pcs_usb_tbl_num = ARRAY_SIZE(x1e80100_usb43dp_pcs_usb_tbl), + + .dp_serdes_tbl = qmp_v6_dp_serdes_tbl, + .dp_serdes_tbl_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl), + .dp_tx_tbl = qmp_v6_dp_tx_tbl, + .dp_tx_tbl_num = ARRAY_SIZE(qmp_v6_dp_tx_tbl), + + .serdes_tbl_rbr = qmp_v6_dp_serdes_tbl_rbr, + .serdes_tbl_rbr_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl_rbr), + .serdes_tbl_hbr = qmp_v6_dp_serdes_tbl_hbr, + .serdes_tbl_hbr_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl_hbr), + .serdes_tbl_hbr2 = qmp_v6_dp_serdes_tbl_hbr2, + .serdes_tbl_hbr2_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl_hbr2), + .serdes_tbl_hbr3 = qmp_v6_dp_serdes_tbl_hbr3, + .serdes_tbl_hbr3_num = ARRAY_SIZE(qmp_v6_dp_serdes_tbl_hbr3), + + .swing_hbr_rbr = &qmp_dp_v5_voltage_swing_hbr_rbr, + .pre_emphasis_hbr_rbr = &qmp_dp_v5_pre_emphasis_hbr_rbr, + .swing_hbr3_hbr2 = &qmp_dp_v5_voltage_swing_hbr3_hbr2, + .pre_emphasis_hbr3_hbr2 = &qmp_dp_v5_pre_emphasis_hbr3_hbr2, + + .dp_aux_init = qmp_v4_dp_aux_init, + .configure_dp_tx = qmp_v4_configure_dp_tx, + .configure_dp_phy = qmp_v4_configure_dp_phy, + .calibrate_dp_phy = qmp_v4_calibrate_dp_phy, + + .reset_list = msm8996_usb3phy_reset_l, + .num_resets = ARRAY_SIZE(msm8996_usb3phy_reset_l), + .vreg_list = qmp_phy_vreg_l, + .num_vregs = ARRAY_SIZE(qmp_phy_vreg_l), + .regs = qmp_v45_usb3phy_regs_layout, +}; + static const struct qmp_phy_cfg sm6350_usb3dpphy_cfg = { .offsets = &qmp_combo_offsets_v3, @@ -3562,6 +3728,10 @@ static const struct of_device_id qmp_combo_of_match_table[] = { .compatible = "qcom,sm8650-qmp-usb3-dp-phy", .data = &sm8550_usb3dpphy_cfg, }, + { + .compatible = "qcom,x1e80100-qmp-usb3-dp-phy", + .data = &x1e80100_usb3dpphy_cfg, + }, { } }; MODULE_DEVICE_TABLE(of, qmp_combo_of_match_table); -- GitLab From c5ffffd714373e7c6b39d3b005dbfbaadbbb4d2d Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 7 Dec 2023 14:34:11 +0200 Subject: [PATCH 0478/1290] dt-bindings: phy: qcom,sc8280xp-qmp-usb3-uni: Add X1E80100 USB PHY binding Add compatible string for Qualcomm QMP Super Speed (SS) UNI PHY found in X1E80100. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Abel Vesa Link: https://lore.kernel.org/r/20231122-phy-qualcomm-usb3-uniphy-x1e80100-v3-1-273814c300f8@linaro.org Signed-off-by: Vinod Koul --- .../bindings/phy/qcom,sc8280xp-qmp-usb3-uni-phy.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb3-uni-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb3-uni-phy.yaml index 57702f7f2a46c..15d82c67f157b 100644 --- a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb3-uni-phy.yaml +++ b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb3-uni-phy.yaml @@ -32,6 +32,7 @@ properties: - qcom,sm8150-qmp-usb3-uni-phy - qcom,sm8250-qmp-usb3-uni-phy - qcom,sm8350-qmp-usb3-uni-phy + - qcom,x1e80100-qmp-usb3-uni-phy reg: @@ -135,6 +136,7 @@ allOf: - qcom,sm8150-qmp-usb3-uni-phy - qcom,sm8250-qmp-usb3-uni-phy - qcom,sm8350-qmp-usb3-uni-phy + - qcom,x1e80100-qmp-usb3-uni-phy then: properties: clocks: @@ -171,6 +173,7 @@ allOf: enum: - qcom,sa8775p-qmp-usb3-uni-phy - qcom,sc8280xp-qmp-usb3-uni-phy + - qcom,x1e80100-qmp-usb3-uni-phy then: required: - power-domains -- GitLab From 2daa9555ba9858c29b9734b3a104c338b718feab Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Thu, 7 Dec 2023 14:34:12 +0200 Subject: [PATCH 0479/1290] phy: qcom-qmp-usb: Add Qualcomm X1E80100 USB3 PHY support The X1E80100 platform has two instances of the USB3 UNI phy attached to the multi-port USB controller, add definition for these. Reviewed-by: Dmitry Baryshkov Signed-off-by: Abel Vesa Link: https://lore.kernel.org/r/20231122-phy-qualcomm-usb3-uniphy-x1e80100-v3-2-273814c300f8@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-usb.c | 171 ++++++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-usb.c b/drivers/phy/qualcomm/phy-qcom-qmp-usb.c index 77e91b3eae79e..243cc2b9a0fb6 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-usb.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-usb.c @@ -25,6 +25,7 @@ #include "phy-qcom-qmp-pcs-usb-v4.h" #include "phy-qcom-qmp-pcs-usb-v5.h" #include "phy-qcom-qmp-pcs-usb-v6.h" +#include "phy-qcom-qmp-pcs-usb-v7.h" /* QPHY_SW_RESET bit */ #define SW_RESET BIT(0) @@ -163,6 +164,17 @@ static const unsigned int qmp_v6_usb3phy_regs_layout[QPHY_LAYOUT_SIZE] = { [QPHY_PCS_LFPS_RXTERM_IRQ_CLEAR] = QPHY_V6_PCS_USB3_LFPS_RXTERM_IRQ_CLEAR, }; +static const unsigned int qmp_v7_usb3phy_regs_layout[QPHY_LAYOUT_SIZE] = { + [QPHY_SW_RESET] = QPHY_V7_PCS_SW_RESET, + [QPHY_START_CTRL] = QPHY_V7_PCS_START_CONTROL, + [QPHY_PCS_STATUS] = QPHY_V7_PCS_PCS_STATUS1, + [QPHY_PCS_POWER_DOWN_CONTROL] = QPHY_V7_PCS_POWER_DOWN_CONTROL, + + /* In PCS_USB */ + [QPHY_PCS_AUTONOMOUS_MODE_CTRL] = QPHY_V7_PCS_USB3_AUTONOMOUS_MODE_CTRL, + [QPHY_PCS_LFPS_RXTERM_IRQ_CLEAR] = QPHY_V7_PCS_USB3_LFPS_RXTERM_IRQ_CLEAR, +}; + static const struct qmp_phy_init_tbl ipq9574_usb3_serdes_tbl[] = { QMP_PHY_INIT_CFG(QSERDES_COM_SYSCLK_EN_SEL, 0x1a), QMP_PHY_INIT_CFG(QSERDES_COM_BIAS_EN_CLKBUFLR_EN, 0x08), @@ -1301,6 +1313,134 @@ static const struct qmp_phy_init_tbl sa8775p_usb3_uniphy_pcs_usb_tbl[] = { QMP_PHY_INIT_CFG(QPHY_V5_PCS_USB3_POWER_STATE_CONFIG1, 0x6f), }; +static const struct qmp_phy_init_tbl x1e80100_usb3_uniphy_serdes_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V7_COM_SSC_STEP_SIZE1_MODE1, 0xc0), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_SSC_STEP_SIZE2_MODE1, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_CP_CTRL_MODE1, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_PLL_RCTRL_MODE1, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_PLL_CCTRL_MODE1, 0x36), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_CORECLK_DIV_MODE1, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_LOCK_CMP1_MODE1, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_LOCK_CMP2_MODE1, 0x41), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_DEC_START_MODE1, 0x41), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_DIV_FRAC_START1_MODE1, 0x55), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_DIV_FRAC_START2_MODE1, 0x75), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_DIV_FRAC_START3_MODE1, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_HSCLK_SEL_1, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_VCO_TUNE1_MODE1, 0x25), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_VCO_TUNE2_MODE1, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_BIN_VCOCAL_CMP_CODE1_MODE1, 0x5c), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_BIN_VCOCAL_CMP_CODE2_MODE1, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_BIN_VCOCAL_CMP_CODE1_MODE0, 0x5c), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_BIN_VCOCAL_CMP_CODE2_MODE0, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_SSC_STEP_SIZE1_MODE0, 0xc0), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_SSC_STEP_SIZE2_MODE0, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_CP_CTRL_MODE0, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_PLL_RCTRL_MODE0, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_PLL_CCTRL_MODE0, 0x36), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_LOCK_CMP1_MODE0, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_LOCK_CMP2_MODE0, 0x1a), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_DEC_START_MODE0, 0x41), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_DIV_FRAC_START1_MODE0, 0x55), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_DIV_FRAC_START2_MODE0, 0x75), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_DIV_FRAC_START3_MODE0, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_VCO_TUNE1_MODE0, 0x25), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_VCO_TUNE2_MODE0, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_BG_TIMER, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_SSC_EN_CENTER, 0x01), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_SSC_PER1, 0x62), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_SSC_PER2, 0x02), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_SYSCLK_BUF_ENABLE, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_SYSCLK_EN_SEL, 0x1a), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_LOCK_CMP_CFG, 0x14), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_VCO_TUNE_MAP, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_CORE_CLK_EN, 0x20), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_CMN_CONFIG_1, 0x16), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_AUTO_GAIN_ADJ_CTRL_1, 0xb6), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_AUTO_GAIN_ADJ_CTRL_2, 0x4b), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_AUTO_GAIN_ADJ_CTRL_3, 0x37), + QMP_PHY_INIT_CFG(QSERDES_V7_COM_ADDITIONAL_MISC, 0x0c), +}; + +static const struct qmp_phy_init_tbl x1e80100_usb3_uniphy_tx_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V7_TX_RES_CODE_LANE_TX, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V7_TX_RES_CODE_LANE_RX, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V7_TX_RES_CODE_LANE_OFFSET_TX, 0x1f), + QMP_PHY_INIT_CFG(QSERDES_V7_TX_RES_CODE_LANE_OFFSET_RX, 0x09), + QMP_PHY_INIT_CFG(QSERDES_V7_TX_LANE_MODE_1, 0xf5), + QMP_PHY_INIT_CFG(QSERDES_V7_TX_LANE_MODE_3, 0x3f), + QMP_PHY_INIT_CFG(QSERDES_V7_TX_LANE_MODE_4, 0x3f), + QMP_PHY_INIT_CFG(QSERDES_V7_TX_LANE_MODE_5, 0x5f), + QMP_PHY_INIT_CFG(QSERDES_V7_TX_RCV_DETECT_LVL_2, 0x12), + QMP_PHY_INIT_CFG(QSERDES_V7_TX_PI_QEC_CTRL, 0x21), +}; + +static const struct qmp_phy_init_tbl x1e80100_usb3_uniphy_rx_tbl[] = { + QMP_PHY_INIT_CFG(QSERDES_V7_RX_UCDR_FO_GAIN, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_UCDR_SO_GAIN, 0x06), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_UCDR_FASTLOCK_FO_GAIN, 0x2f), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_UCDR_SO_SATURATION_AND_ENABLE, 0x7f), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_UCDR_FASTLOCK_COUNT_LOW, 0xff), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_UCDR_FASTLOCK_COUNT_HIGH, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_UCDR_PI_CONTROLS, 0x99), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_UCDR_SB2_THRESH1, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_UCDR_SB2_THRESH2, 0x08), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_UCDR_SB2_GAIN1, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_UCDR_SB2_GAIN2, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_AUX_DATA_TCOARSE_TFINE, 0xa0), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_VGA_CAL_CNTRL1, 0x54), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_VGA_CAL_CNTRL2, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_GM_CAL, 0x13), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_EQU_ADAPTOR_CNTRL2, 0x0f), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_EQU_ADAPTOR_CNTRL3, 0x4a), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_EQU_ADAPTOR_CNTRL4, 0x0a), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_IDAC_TSETTLE_LOW, 0x07), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_IDAC_TSETTLE_HIGH, 0x00), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_EQ_OFFSET_ADAPTOR_CNTRL1, 0x47), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_SIGDET_CNTRL, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_SIGDET_DEGLITCH_CNTRL, 0x0e), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_MODE_00_LOW, 0x3f), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_MODE_00_HIGH, 0xbf), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_MODE_00_HIGH2, 0xff), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_MODE_00_HIGH3, 0xdf), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_MODE_00_HIGH4, 0xed), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_MODE_01_LOW, 0xdc), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_MODE_01_HIGH, 0x5c), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_MODE_01_HIGH2, 0x9c), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_MODE_01_HIGH3, 0x1d), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_RX_MODE_01_HIGH4, 0x09), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_DFE_EN_TIMER, 0x04), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_DFE_CTLE_POST_CAL_OFFSET, 0x38), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_DCC_CTRL1, 0x0c), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_VTH_CODE, 0x10), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_SIGDET_CAL_CTRL1, 0x14), + QMP_PHY_INIT_CFG(QSERDES_V7_RX_SIGDET_CAL_TRIM, 0x08), +}; + +static const struct qmp_phy_init_tbl x1e80100_usb3_uniphy_pcs_tbl[] = { + QMP_PHY_INIT_CFG(QPHY_V7_PCS_LOCK_DETECT_CONFIG1, 0xc4), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_LOCK_DETECT_CONFIG2, 0x89), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_LOCK_DETECT_CONFIG3, 0x20), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_LOCK_DETECT_CONFIG6, 0x13), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_REFGEN_REQ_CONFIG1, 0x21), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_RX_SIGDET_LVL, 0xaa), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_RCVR_DTCT_DLY_P1U2_L, 0xe7), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_RCVR_DTCT_DLY_P1U2_H, 0x03), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_CDR_RESET_TIME, 0x0a), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_ALIGN_DETECT_CONFIG1, 0x88), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_ALIGN_DETECT_CONFIG2, 0x13), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_PCS_TX_RX_CONFIG, 0x0c), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_EQ_CONFIG1, 0x4b), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_EQ_CONFIG5, 0x10), +}; + +static const struct qmp_phy_init_tbl x1e80100_usb3_uniphy_pcs_usb_tbl[] = { + QMP_PHY_INIT_CFG(QPHY_V7_PCS_USB3_LFPS_DET_HIGH_COUNT_VAL, 0xf8), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_USB3_RXEQTRAINING_DFE_TIME_S2, 0x07), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_USB3_RCVR_DTCT_DLY_U3_L, 0x40), + QMP_PHY_INIT_CFG(QPHY_V7_PCS_USB3_RCVR_DTCT_DLY_U3_H, 0x00), +}; + struct qmp_usb_offsets { u16 serdes; u16 pcs; @@ -1465,6 +1605,14 @@ static const struct qmp_usb_offsets qmp_usb_offsets_v6 = { .rx = 0x1000, }; +static const struct qmp_usb_offsets qmp_usb_offsets_v7 = { + .serdes = 0, + .pcs = 0x0200, + .pcs_usb = 0x1200, + .tx = 0x0e00, + .rx = 0x1000, +}; + static const struct qmp_phy_cfg ipq8074_usb3phy_cfg = { .lanes = 1, @@ -1752,6 +1900,26 @@ static const struct qmp_phy_cfg qcm2290_usb3phy_cfg = { .regs = qmp_v3_usb3phy_regs_layout_qcm2290, }; +static const struct qmp_phy_cfg x1e80100_usb3_uniphy_cfg = { + .lanes = 1, + + .offsets = &qmp_usb_offsets_v7, + + .serdes_tbl = x1e80100_usb3_uniphy_serdes_tbl, + .serdes_tbl_num = ARRAY_SIZE(x1e80100_usb3_uniphy_serdes_tbl), + .tx_tbl = x1e80100_usb3_uniphy_tx_tbl, + .tx_tbl_num = ARRAY_SIZE(x1e80100_usb3_uniphy_tx_tbl), + .rx_tbl = x1e80100_usb3_uniphy_rx_tbl, + .rx_tbl_num = ARRAY_SIZE(x1e80100_usb3_uniphy_rx_tbl), + .pcs_tbl = x1e80100_usb3_uniphy_pcs_tbl, + .pcs_tbl_num = ARRAY_SIZE(x1e80100_usb3_uniphy_pcs_tbl), + .pcs_usb_tbl = x1e80100_usb3_uniphy_pcs_usb_tbl, + .pcs_usb_tbl_num = ARRAY_SIZE(x1e80100_usb3_uniphy_pcs_usb_tbl), + .vreg_list = qmp_phy_vreg_l, + .num_vregs = ARRAY_SIZE(qmp_phy_vreg_l), + .regs = qmp_v7_usb3phy_regs_layout, +}; + static void qmp_usb_configure_lane(void __iomem *base, const struct qmp_phy_init_tbl tbl[], int num, @@ -2441,6 +2609,9 @@ static const struct of_device_id qmp_usb_of_match_table[] = { }, { .compatible = "qcom,sm8350-qmp-usb3-uni-phy", .data = &sm8350_usb3_uniphy_cfg, + }, { + .compatible = "qcom,x1e80100-qmp-usb3-uni-phy", + .data = &x1e80100_usb3_uniphy_cfg, }, { }, }; -- GitLab From ac254dfb983deb7840bc7267418d1ae231f5694f Mon Sep 17 00:00:00 2001 From: "JiaLong.Yang" Date: Thu, 21 Dec 2023 14:02:41 +0800 Subject: [PATCH 0480/1290] perf vendor events powerpc: Add PVN for HX-C2000 CPU with Power8 Architecture HX-C2000 is a new CPU made by HEXIN Technologies Co., Ltd. And a new PVN 0x0066 has been applied from the OpenPower Community for this CPU. Here is a patch to make perf tool run in the CPU. Reviewed-by: Madhavan Srinivasan Signed-off-by: JiaLong.Yang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: shenghui.qu@shingroup.cn Cc: Zhao Ke Cc: zhijie.ren@shingroup.cn Link: https://lore.kernel.org/r/20231221060242.4532-1-jialong.yang@shingroup.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/powerpc/mapfile.csv | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/perf/pmu-events/arch/powerpc/mapfile.csv b/tools/perf/pmu-events/arch/powerpc/mapfile.csv index f4908af7ad66b..599a588dbeb40 100644 --- a/tools/perf/pmu-events/arch/powerpc/mapfile.csv +++ b/tools/perf/pmu-events/arch/powerpc/mapfile.csv @@ -11,8 +11,7 @@ # # Multiple PVRs could map to a single JSON file. # - -# Power8 entries 0x004[bcd][[:xdigit:]]{4},1,power8,core +0x0066[[:xdigit:]]{4},1,power8,core 0x004e[[:xdigit:]]{4},1,power9,core 0x0080[[:xdigit:]]{4},1,power10,core -- GitLab From 855c2e1d1842f0101a051378094548ca581d7a7d Mon Sep 17 00:00:00 2001 From: Jan Kuliga Date: Mon, 18 Dec 2023 12:39:39 +0100 Subject: [PATCH 0481/1290] dmaengine: xilinx: xdma: Rework xdma_terminate_all() Simplify xdma_xfer_stop(). Stop the dma engine and clear its status register unconditionally - just do what its name states. This change also allows to call it without grabbing a lock, which minimizes the total time spent with a spinlock held. Delete the currently processed vd.node from the vc.desc_issued list prior to passing it to vchan_terminate_vdesc(). In case there's more than one descriptor pending on vc.desc_issued list, calling vchan_terminate_desc() results in losing the link between vc.desc_issued list head and the second descriptor on the list. Doing so results in resources leakege, as vchan_dma_desc_free_list() won't be able to properly free memory resources attached to descriptors, resulting in dma_pool_destroy() failure. Don't call vchan_dma_desc_free_list() from within xdma_terminate_all(). Move all terminated descriptors to the vc.desc_terminated list instead. This allows to postpone freeing memory resources associated with descriptors until the call to vchan_synchronize(), which is called from xdma_synchronize() callback. This is the right way to do it - xdma_terminate_all() should return as soon as possible, while freeing resources (that may be time consuming in case of large number of descriptors) can be done safely later. Fixes: f5c392d106e7 ("dmaengine: xilinx: xdma: Add terminate_all/synchronize callbacks") Signed-off-by: Jan Kuliga Link: https://lore.kernel.org/r/20231218113943.9099-5-jankul@alatek.krakow.pl Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index c22701e76b693..0c73508638738 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -379,12 +379,9 @@ static int xdma_xfer_start(struct xdma_chan *xchan) */ static int xdma_xfer_stop(struct xdma_chan *xchan) { - struct virt_dma_desc *vd = vchan_next_desc(&xchan->vchan); - struct xdma_device *xdev = xchan->xdev_hdl; int ret; - - if (!vd || !xchan->busy) - return -EINVAL; + u32 val; + struct xdma_device *xdev = xchan->xdev_hdl; /* clear run stop bit to prevent any further auto-triggering */ ret = regmap_write(xdev->rmap, xchan->base + XDMA_CHAN_CONTROL_W1C, @@ -392,7 +389,10 @@ static int xdma_xfer_stop(struct xdma_chan *xchan) if (ret) return ret; - xchan->busy = false; + /* Clear the channel status register */ + ret = regmap_read(xdev->rmap, xchan->base + XDMA_CHAN_STATUS_RC, &val); + if (ret) + return ret; return 0; } @@ -505,25 +505,25 @@ static void xdma_issue_pending(struct dma_chan *chan) static int xdma_terminate_all(struct dma_chan *chan) { struct xdma_chan *xdma_chan = to_xdma_chan(chan); - struct xdma_desc *desc = NULL; struct virt_dma_desc *vd; unsigned long flags; LIST_HEAD(head); - spin_lock_irqsave(&xdma_chan->vchan.lock, flags); xdma_xfer_stop(xdma_chan); + spin_lock_irqsave(&xdma_chan->vchan.lock, flags); + + xdma_chan->busy = false; vd = vchan_next_desc(&xdma_chan->vchan); - if (vd) - desc = to_xdma_desc(vd); - if (desc) { - dma_cookie_complete(&desc->vdesc.tx); - vchan_terminate_vdesc(&desc->vdesc); + if (vd) { + list_del(&vd->node); + dma_cookie_complete(&vd->tx); + vchan_terminate_vdesc(vd); } - vchan_get_all_descriptors(&xdma_chan->vchan, &head); + list_splice_tail(&head, &xdma_chan->vchan.desc_terminated); + spin_unlock_irqrestore(&xdma_chan->vchan.lock, flags); - vchan_dma_desc_free_list(&xdma_chan->vchan, &head); return 0; } -- GitLab From d0f22a3f55044f91b98e5536aa2c4d51d41cf8e7 Mon Sep 17 00:00:00 2001 From: Jan Kuliga Date: Mon, 18 Dec 2023 12:39:40 +0100 Subject: [PATCH 0482/1290] dmaengine: xilinx: xdma: Add error checking in xdma_channel_isr() Check and clear the status register value before proceeding any further in xdma_channel_isr(). It is necessary to do it since the interrupt may occur on any error condition enabled at the start of a transfer. Signed-off-by: Jan Kuliga Link: https://lore.kernel.org/r/20231218113943.9099-6-jankul@alatek.krakow.pl Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 0c73508638738..9a1d2939a333f 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -811,6 +811,18 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) desc = to_xdma_desc(vd); xdev = xchan->xdev_hdl; + /* Clear-on-read the status register */ + ret = regmap_read(xdev->rmap, xchan->base + XDMA_CHAN_STATUS_RC, &st); + if (ret) + goto out; + + st &= XDMA_CHAN_STATUS_MASK; + if ((st & XDMA_CHAN_ERROR_MASK) || + !(st & (CHAN_CTRL_IE_DESC_COMPLETED | CHAN_CTRL_IE_DESC_STOPPED))) { + xdma_err(xdev, "channel error, status register value: 0x%x", st); + goto out; + } + ret = regmap_read(xdev->rmap, xchan->base + XDMA_CHAN_COMPLETED_DESC, &complete_desc_num); if (ret) @@ -818,14 +830,6 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) if (desc->cyclic) { desc->completed_desc_num = complete_desc_num; - - ret = regmap_read(xdev->rmap, xchan->base + XDMA_CHAN_STATUS, - &st); - if (ret) - goto out; - - regmap_write(xdev->rmap, xchan->base + XDMA_CHAN_STATUS, st); - vchan_cyclic_callback(vd); } else { xchan->busy = false; -- GitLab From fd0e1d83a813d48285d39eae0053b38828cf7e1a Mon Sep 17 00:00:00 2001 From: Jan Kuliga Date: Mon, 18 Dec 2023 12:39:41 +0100 Subject: [PATCH 0483/1290] dmaengine: xilinx: xdma: Add transfer error reporting Extend the capability of transfer status reporting. Introduce error flag, which allows to report error in case of a interrupt-reported error condition. Signed-off-by: Jan Kuliga Link: https://lore.kernel.org/r/20231218113943.9099-7-jankul@alatek.krakow.pl Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 9a1d2939a333f..9f8597ed9be24 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -85,6 +85,7 @@ struct xdma_chan { * @cyclic: Cyclic transfer vs. scatter-gather * @periods: Number of periods in the cyclic transfer * @period_size: Size of a period in bytes in cyclic transfers + * @error: tx error flag */ struct xdma_desc { struct virt_dma_desc vdesc; @@ -97,6 +98,7 @@ struct xdma_desc { bool cyclic; u32 periods; u32 period_size; + bool error; }; #define XDMA_DEV_STATUS_REG_DMA BIT(0) @@ -274,6 +276,7 @@ xdma_alloc_desc(struct xdma_chan *chan, u32 desc_num, bool cyclic) sw_desc->chan = chan; sw_desc->desc_num = desc_num; sw_desc->cyclic = cyclic; + sw_desc->error = false; dblk_num = DIV_ROUND_UP(desc_num, XDMA_DESC_ADJACENT); sw_desc->desc_blocks = kcalloc(dblk_num, sizeof(*sw_desc->desc_blocks), GFP_NOWAIT); @@ -769,20 +772,20 @@ static enum dma_status xdma_tx_status(struct dma_chan *chan, dma_cookie_t cookie spin_lock_irqsave(&xdma_chan->vchan.lock, flags); vd = vchan_find_desc(&xdma_chan->vchan, cookie); - if (vd) - desc = to_xdma_desc(vd); - if (!desc || !desc->cyclic) { - spin_unlock_irqrestore(&xdma_chan->vchan.lock, flags); - return ret; - } - - period_idx = desc->completed_desc_num % desc->periods; - residue = (desc->periods - period_idx) * desc->period_size; + if (!vd) + goto out; + desc = to_xdma_desc(vd); + if (desc->error) { + ret = DMA_ERROR; + } else if (desc->cyclic) { + period_idx = desc->completed_desc_num % desc->periods; + residue = (desc->periods - period_idx) * desc->period_size; + dma_set_residue(state, residue); + } +out: spin_unlock_irqrestore(&xdma_chan->vchan.lock, flags); - dma_set_residue(state, residue); - return ret; } @@ -819,6 +822,7 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) st &= XDMA_CHAN_STATUS_MASK; if ((st & XDMA_CHAN_ERROR_MASK) || !(st & (CHAN_CTRL_IE_DESC_COMPLETED | CHAN_CTRL_IE_DESC_STOPPED))) { + desc->error = true; xdma_err(xdev, "channel error, status register value: 0x%x", st); goto out; } -- GitLab From 3e184e64c2e5145e51dc57872b0a64e3a6736888 Mon Sep 17 00:00:00 2001 From: Jan Kuliga Date: Mon, 18 Dec 2023 12:39:42 +0100 Subject: [PATCH 0484/1290] dmaengine: xilinx: xdma: Prepare the introduction of interleaved DMA transfers Make generic code generic. As descriptor-filling logic stays the same regardless of a dmaengine's type of transfer, it is possible to write the descriptor-filling function in a generic way, so that it can be used for every single type of transfer preparation callback. Signed-off-by: Jan Kuliga Link: https://lore.kernel.org/r/20231218113943.9099-8-jankul@alatek.krakow.pl Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 101 +++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 44 deletions(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 9f8597ed9be24..618cc9af6eb94 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -542,6 +542,43 @@ static void xdma_synchronize(struct dma_chan *chan) vchan_synchronize(&xdma_chan->vchan); } +/** + * xdma_fill_descs - Fill hardware descriptors with contiguous memory block addresses + * @sw_desc - tx descriptor state container + * @src_addr - Value for a ->src_addr field of a first descriptor + * @dst_addr - Value for a ->dst_addr field of a first descriptor + * @size - Total size of a contiguous memory block + * @filled_descs_num - Number of filled hardware descriptors for corresponding sw_desc + */ +static inline u32 xdma_fill_descs(struct xdma_desc *sw_desc, u64 src_addr, + u64 dst_addr, u32 size, u32 filled_descs_num) +{ + u32 left = size, len, desc_num = filled_descs_num; + struct xdma_desc_block *dblk; + struct xdma_hw_desc *desc; + + dblk = sw_desc->desc_blocks + (desc_num / XDMA_DESC_ADJACENT); + desc = dblk->virt_addr; + desc += desc_num & XDMA_DESC_ADJACENT_MASK; + do { + len = min_t(u32, left, XDMA_DESC_BLEN_MAX); + /* set hardware descriptor */ + desc->bytes = cpu_to_le32(len); + desc->src_addr = cpu_to_le64(src_addr); + desc->dst_addr = cpu_to_le64(dst_addr); + if (!(++desc_num & XDMA_DESC_ADJACENT_MASK)) + desc = (++dblk)->virt_addr; + else + desc++; + + src_addr += len; + dst_addr += len; + left -= len; + } while (left); + + return desc_num - filled_descs_num; +} + /** * xdma_prep_device_sg - prepare a descriptor for a DMA transaction * @chan: DMA channel pointer @@ -558,13 +595,10 @@ xdma_prep_device_sg(struct dma_chan *chan, struct scatterlist *sgl, { struct xdma_chan *xdma_chan = to_xdma_chan(chan); struct dma_async_tx_descriptor *tx_desc; - u32 desc_num = 0, i, len, rest; - struct xdma_desc_block *dblk; - struct xdma_hw_desc *desc; struct xdma_desc *sw_desc; - u64 dev_addr, *src, *dst; + u32 desc_num = 0, i; + u64 addr, dev_addr, *src, *dst; struct scatterlist *sg; - u64 addr; for_each_sg(sgl, sg, sg_len, i) desc_num += DIV_ROUND_UP(sg_dma_len(sg), XDMA_DESC_BLEN_MAX); @@ -584,32 +618,11 @@ xdma_prep_device_sg(struct dma_chan *chan, struct scatterlist *sgl, dst = &addr; } - dblk = sw_desc->desc_blocks; - desc = dblk->virt_addr; - desc_num = 1; + desc_num = 0; for_each_sg(sgl, sg, sg_len, i) { addr = sg_dma_address(sg); - rest = sg_dma_len(sg); - - do { - len = min_t(u32, rest, XDMA_DESC_BLEN_MAX); - /* set hardware descriptor */ - desc->bytes = cpu_to_le32(len); - desc->src_addr = cpu_to_le64(*src); - desc->dst_addr = cpu_to_le64(*dst); - - if (!(desc_num & XDMA_DESC_ADJACENT_MASK)) { - dblk++; - desc = dblk->virt_addr; - } else { - desc++; - } - - desc_num++; - dev_addr += len; - addr += len; - rest -= len; - } while (rest); + desc_num += xdma_fill_descs(sw_desc, *src, *dst, sg_dma_len(sg), desc_num); + dev_addr += sg_dma_len(sg); } tx_desc = vchan_tx_prep(&xdma_chan->vchan, &sw_desc->vdesc, flags); @@ -643,9 +656,9 @@ xdma_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t address, struct xdma_device *xdev = xdma_chan->xdev_hdl; unsigned int periods = size / period_size; struct dma_async_tx_descriptor *tx_desc; - struct xdma_desc_block *dblk; - struct xdma_hw_desc *desc; struct xdma_desc *sw_desc; + u64 addr, dev_addr, *src, *dst; + u32 desc_num; unsigned int i; /* @@ -670,21 +683,21 @@ xdma_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t address, sw_desc->period_size = period_size; sw_desc->dir = dir; - dblk = sw_desc->desc_blocks; - desc = dblk->virt_addr; + addr = address; + if (dir == DMA_MEM_TO_DEV) { + dev_addr = xdma_chan->cfg.dst_addr; + src = &addr; + dst = &dev_addr; + } else { + dev_addr = xdma_chan->cfg.src_addr; + src = &dev_addr; + dst = &addr; + } - /* fill hardware descriptor */ + desc_num = 0; for (i = 0; i < periods; i++) { - desc->bytes = cpu_to_le32(period_size); - if (dir == DMA_MEM_TO_DEV) { - desc->src_addr = cpu_to_le64(address + i * period_size); - desc->dst_addr = cpu_to_le64(xdma_chan->cfg.dst_addr); - } else { - desc->src_addr = cpu_to_le64(xdma_chan->cfg.src_addr); - desc->dst_addr = cpu_to_le64(address + i * period_size); - } - - desc++; + desc_num += xdma_fill_descs(sw_desc, *src, *dst, period_size, desc_num); + addr += i * period_size; } tx_desc = vchan_tx_prep(&xdma_chan->vchan, &sw_desc->vdesc, flags); -- GitLab From 2f8f90cd2f8d237c51c2775a53ef0d8c8acaa707 Mon Sep 17 00:00:00 2001 From: Jan Kuliga Date: Mon, 18 Dec 2023 12:39:43 +0100 Subject: [PATCH 0485/1290] dmaengine: xilinx: xdma: Implement interleaved DMA transfers Interleaved DMA functionality allows dmaengine clients' to express DMA transfers in an arbitrary way. This is extremely useful in FPGA environments, where a greater transfer flexibility is needed. For instance, in one FPGA design there may be need to do DMA to/from a FIFO at a fixed address, and also to do DMA to/from a (non)contiguous RAM memory. Introduce separate tx preparation callback and add tx-flags handling logic. Their behavior is based on the description of interleaved DMA transfers in both source code and the DMAEngine's documentation. Since XDMA is a fully-fledged scatter-gather dma engine, the logic of xdma_prep_interleaved_dma() is fairly simple and similar to the other tx preparation callbacks. The whole tx-flags handling logic resides in xdma_channel_isr(). Transfer of a single frame from a interleaved DMA transfer template is pretty similar to the single sg transaction. Therefore, the transaction of the whole interleaved DMA transfer template is basically a cyclic dma transaction with finite cycles/periods (equal to the frame of count) of a single sg transfers. Signed-off-by: Jan Kuliga Link: https://lore.kernel.org/r/20231218113943.9099-9-jankul@alatek.krakow.pl Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 107 ++++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 9 deletions(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 618cc9af6eb94..9360b85131ef9 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -83,8 +83,10 @@ struct xdma_chan { * @desc_num: Number of hardware descriptors * @completed_desc_num: Completed hardware descriptors * @cyclic: Cyclic transfer vs. scatter-gather + * @interleaved_dma: Interleaved DMA transfer * @periods: Number of periods in the cyclic transfer * @period_size: Size of a period in bytes in cyclic transfers + * @frames_left: Number of frames left in interleaved DMA transfer * @error: tx error flag */ struct xdma_desc { @@ -96,8 +98,10 @@ struct xdma_desc { u32 desc_num; u32 completed_desc_num; bool cyclic; + bool interleaved_dma; u32 periods; u32 period_size; + u32 frames_left; bool error; }; @@ -607,6 +611,8 @@ xdma_prep_device_sg(struct dma_chan *chan, struct scatterlist *sgl, if (!sw_desc) return NULL; sw_desc->dir = dir; + sw_desc->cyclic = false; + sw_desc->interleaved_dma = false; if (dir == DMA_MEM_TO_DEV) { dev_addr = xdma_chan->cfg.dst_addr; @@ -682,6 +688,7 @@ xdma_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t address, sw_desc->periods = periods; sw_desc->period_size = period_size; sw_desc->dir = dir; + sw_desc->interleaved_dma = false; addr = address; if (dir == DMA_MEM_TO_DEV) { @@ -712,6 +719,57 @@ failed: return NULL; } +/** + * xdma_prep_interleaved_dma - Prepare virtual descriptor for interleaved DMA transfers + * @chan: DMA channel + * @xt: DMA transfer template + * @flags: tx flags + */ +struct dma_async_tx_descriptor * +xdma_prep_interleaved_dma(struct dma_chan *chan, + struct dma_interleaved_template *xt, + unsigned long flags) +{ + int i; + u32 desc_num = 0, period_size = 0; + struct dma_async_tx_descriptor *tx_desc; + struct xdma_chan *xchan = to_xdma_chan(chan); + struct xdma_desc *sw_desc; + u64 src_addr, dst_addr; + + for (i = 0; i < xt->frame_size; ++i) + desc_num += DIV_ROUND_UP(xt->sgl[i].size, XDMA_DESC_BLEN_MAX); + + sw_desc = xdma_alloc_desc(xchan, desc_num, false); + if (!sw_desc) + return NULL; + sw_desc->dir = xt->dir; + sw_desc->interleaved_dma = true; + sw_desc->cyclic = flags & DMA_PREP_REPEAT; + sw_desc->frames_left = xt->numf; + sw_desc->periods = xt->numf; + + desc_num = 0; + src_addr = xt->src_start; + dst_addr = xt->dst_start; + for (i = 0; i < xt->frame_size; ++i) { + desc_num += xdma_fill_descs(sw_desc, src_addr, dst_addr, xt->sgl[i].size, desc_num); + src_addr += dmaengine_get_src_icg(xt, &xt->sgl[i]) + xt->src_inc ? + xt->sgl[i].size : 0; + dst_addr += dmaengine_get_dst_icg(xt, &xt->sgl[i]) + xt->dst_inc ? + xt->sgl[i].size : 0; + period_size += xt->sgl[i].size; + } + sw_desc->period_size = period_size; + + tx_desc = vchan_tx_prep(&xchan->vchan, &sw_desc->vdesc, flags); + if (tx_desc) + return tx_desc; + + xdma_free_desc(&sw_desc->vdesc); + return NULL; +} + /** * xdma_device_config - Configure the DMA channel * @chan: DMA channel @@ -811,11 +869,12 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) { struct xdma_chan *xchan = dev_id; u32 complete_desc_num = 0; - struct xdma_device *xdev; - struct virt_dma_desc *vd; + struct xdma_device *xdev = xchan->xdev_hdl; + struct virt_dma_desc *vd, *next_vd; struct xdma_desc *desc; int ret; u32 st; + bool repeat_tx; spin_lock(&xchan->vchan.lock); @@ -824,9 +883,6 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) if (!vd) goto out; - desc = to_xdma_desc(vd); - xdev = xchan->xdev_hdl; - /* Clear-on-read the status register */ ret = regmap_read(xdev->rmap, xchan->base + XDMA_CHAN_STATUS_RC, &st); if (ret) @@ -845,10 +901,36 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) if (ret) goto out; - if (desc->cyclic) { - desc->completed_desc_num = complete_desc_num; - vchan_cyclic_callback(vd); - } else { + desc = to_xdma_desc(vd); + if (desc->interleaved_dma) { + xchan->busy = false; + desc->completed_desc_num += complete_desc_num; + if (complete_desc_num == XDMA_DESC_BLOCK_NUM * XDMA_DESC_ADJACENT) { + xdma_xfer_start(xchan); + goto out; + } + + /* last desc of any frame */ + desc->frames_left--; + if (desc->frames_left) + goto out; + + /* last desc of the last frame */ + repeat_tx = vd->tx.flags & DMA_PREP_REPEAT; + next_vd = list_first_entry_or_null(&vd->node, struct virt_dma_desc, node); + if (next_vd) + repeat_tx = repeat_tx && !(next_vd->tx.flags & DMA_PREP_LOAD_EOT); + if (repeat_tx) { + desc->frames_left = desc->periods; + desc->completed_desc_num = 0; + vchan_cyclic_callback(vd); + } else { + list_del(&vd->node); + vchan_cookie_complete(vd); + } + /* start (or continue) the tx of a first desc on the vc.desc_issued list, if any */ + xdma_xfer_start(xchan); + } else if (!desc->cyclic) { xchan->busy = false; desc->completed_desc_num += complete_desc_num; @@ -865,6 +947,9 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) /* transfer the rest of data */ xdma_xfer_start(xchan); + } else { + desc->completed_desc_num = complete_desc_num; + vchan_cyclic_callback(vd); } out: @@ -1163,6 +1248,9 @@ static int xdma_probe(struct platform_device *pdev) dma_cap_set(DMA_SLAVE, xdev->dma_dev.cap_mask); dma_cap_set(DMA_PRIVATE, xdev->dma_dev.cap_mask); dma_cap_set(DMA_CYCLIC, xdev->dma_dev.cap_mask); + dma_cap_set(DMA_INTERLEAVE, xdev->dma_dev.cap_mask); + dma_cap_set(DMA_REPEAT, xdev->dma_dev.cap_mask); + dma_cap_set(DMA_LOAD_EOT, xdev->dma_dev.cap_mask); xdev->dma_dev.dev = &pdev->dev; xdev->dma_dev.residue_granularity = DMA_RESIDUE_GRANULARITY_SEGMENT; @@ -1178,6 +1266,7 @@ static int xdma_probe(struct platform_device *pdev) xdev->dma_dev.filter.mapcnt = pdata->device_map_cnt; xdev->dma_dev.filter.fn = xdma_filter_fn; xdev->dma_dev.device_prep_dma_cyclic = xdma_prep_dma_cyclic; + xdev->dma_dev.device_prep_interleaved_dma = xdma_prep_interleaved_dma; ret = dma_async_device_register(&xdev->dma_dev); if (ret) { -- GitLab From 22a9d9585812440211b0b34a6bc02ade62314be4 Mon Sep 17 00:00:00 2001 From: Bumyong Lee Date: Tue, 19 Dec 2023 14:50:26 +0900 Subject: [PATCH 0486/1290] dmaengine: pl330: issue_pending waits until WFP state According to DMA-330 errata notice[1] 71930, DMAKILL cannot clear internal signal, named pipeline_req_active. it makes that pl330 would wait forever in WFP state although dma already send dma request if pl330 gets dma request before entering WFP state. The errata suggests that polling until entering WFP state as workaround and then peripherals allows to issue dma request. [1]: https://developer.arm.com/documentation/genc008428/latest Signed-off-by: Bumyong Lee Link: https://lore.kernel.org/r/20231219055026.118695-1-bumyong.lee@samsung.com Signed-off-by: Vinod Koul --- drivers/dma/pl330.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index 3cf0b38387ae5..c29744bfdf2c2 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -1053,6 +1053,9 @@ static bool _trigger(struct pl330_thread *thrd) thrd->req_running = idx; + if (desc->rqtype == DMA_MEM_TO_DEV || desc->rqtype == DMA_DEV_TO_MEM) + UNTIL(thrd, PL330_STATE_WFP); + return true; } -- GitLab From bbcd7b588b0bf967f90df60fccd16c9c49b768ea Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Fri, 22 Dec 2023 15:10:17 +0530 Subject: [PATCH 0487/1290] dmaengine: xilinx: xdma: Workaround truncation compilation error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increase length to be copied to be large enough to overcome the following compilation error. The buf is large enough for this purpose. drivers/dma/xilinx/xilinx_dpdma.c: In function ‘xilinx_dpdma_debugfs_desc_done_irq_read’: drivers/dma/xilinx/xilinx_dpdma.c:313:39: error: ‘snprintf’ output may be truncated before the last format character [-Werror=format-truncation=] 313 | snprintf(buf, out_str_len, "%d", | ^ drivers/dma/xilinx/xilinx_dpdma.c:313:9: note: ‘snprintf’ output between 2 and 6 bytes into a destination of size 5 313 | snprintf(buf, out_str_len, "%d", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 314 | dpdma_debugfs.xilinx_dpdma_irq_done_count); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Vinod Koul Link: https://lore.kernel.org/r/20231222094017.731917-1-vkoul@kernel.org Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xilinx_dpdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/xilinx/xilinx_dpdma.c b/drivers/dma/xilinx/xilinx_dpdma.c index 69587d85a7cd2..b82815e64d24e 100644 --- a/drivers/dma/xilinx/xilinx_dpdma.c +++ b/drivers/dma/xilinx/xilinx_dpdma.c @@ -309,7 +309,7 @@ static ssize_t xilinx_dpdma_debugfs_desc_done_irq_read(char *buf) out_str_len = strlen(XILINX_DPDMA_DEBUGFS_UINT16_MAX_STR); out_str_len = min_t(size_t, XILINX_DPDMA_DEBUGFS_READ_MAX_SIZE, - out_str_len); + out_str_len + 1); snprintf(buf, out_str_len, "%d", dpdma_debugfs.xilinx_dpdma_irq_done_count); -- GitLab From 3d0b2176e04261ab4ac095ff2a17db077fc1e46d Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Fri, 22 Dec 2023 15:10:01 +0530 Subject: [PATCH 0488/1290] dmaengine: xilinx: xdma: statify xdma_prep_interleaved_dma xdma_prep_interleaved_dma() was local to file but not declared static, leading to warning: drivers/dma/xilinx/xdma.c:729:1: warning: no previous prototype for 'xdma_prep_interleaved_dma' [-Wmissing-prototypes] 729 | xdma_prep_interleaved_dma(struct dma_chan *chan Reported-by: Stephen Rothwell Signed-off-by: Vinod Koul Link: https://lore.kernel.org/r/20231222094001.731889-1-vkoul@kernel.org Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 9360b85131ef9..4ebc90b41bdb9 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -725,7 +725,7 @@ failed: * @xt: DMA transfer template * @flags: tx flags */ -struct dma_async_tx_descriptor * +static struct dma_async_tx_descriptor * xdma_prep_interleaved_dma(struct dma_chan *chan, struct dma_interleaved_template *xt, unsigned long flags) -- GitLab From d719915ad9706a16adde231789a1d46fc12fb9c7 Mon Sep 17 00:00:00 2001 From: Chintan Vankar Date: Thu, 21 Dec 2023 15:59:55 +0530 Subject: [PATCH 0489/1290] phy: ti: gmii-sel: Enable SGMII mode for J784S4 TI's J784S4 SoC supports SGMII mode with the CPSW9G instance of the CPSW Ethernet Switch. Thus, enable it by adding SGMII mode to the list of the corresponding extra_modes member. Signed-off-by: Chintan Vankar Reviewed-by: Roger Quadros Link: https://lore.kernel.org/r/20231221102956.754617-1-c-vankar@ti.com Signed-off-by: Vinod Koul --- drivers/phy/ti/phy-gmii-sel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/ti/phy-gmii-sel.c b/drivers/phy/ti/phy-gmii-sel.c index 555b323f45da1..05004c8a758f2 100644 --- a/drivers/phy/ti/phy-gmii-sel.c +++ b/drivers/phy/ti/phy-gmii-sel.c @@ -247,7 +247,7 @@ static const struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j784s4 = { .use_of_data = true, .regfields = phy_gmii_sel_fields_am654, - .extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | + .extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | BIT(PHY_INTERFACE_MODE_SGMII) | BIT(PHY_INTERFACE_MODE_USXGMII), .num_ports = 8, .num_qsgmii_main_ports = 2, -- GitLab From 2029e71482fcd94dcc7df2c66c7fa635479748bf Mon Sep 17 00:00:00 2001 From: Chintan Vankar Date: Thu, 21 Dec 2023 15:59:56 +0530 Subject: [PATCH 0490/1290] phy: ti: j721e-wiz: Add SGMII support in WIZ driver for J784S4 Enable full rate divider configuration support for J784S4_WIZ_10G for SGMII. Signed-off-by: Chintan Vankar Reviewed-by: Roger Quadros Link: https://lore.kernel.org/r/20231221102956.754617-2-c-vankar@ti.com Signed-off-by: Vinod Koul --- drivers/phy/ti/phy-j721e-wiz.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/phy/ti/phy-j721e-wiz.c b/drivers/phy/ti/phy-j721e-wiz.c index fc3cd98c60ff4..00d7e6a6de03a 100644 --- a/drivers/phy/ti/phy-j721e-wiz.c +++ b/drivers/phy/ti/phy-j721e-wiz.c @@ -1240,6 +1240,7 @@ static int wiz_phy_fullrt_div(struct wiz *wiz, int lane) case J721E_WIZ_10G: case J7200_WIZ_10G: case J721S2_WIZ_10G: + case J784S4_WIZ_10G: if (wiz->lane_phy_type[lane] == PHY_TYPE_SGMII) return regmap_field_write(wiz->p0_fullrt_div[lane], 0x2); break; -- GitLab From 26547691107eda45b0f20ee79bad19bbe5fcbfd7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 22 Dec 2023 11:35:22 -0500 Subject: [PATCH 0491/1290] tracing/selftests: Remove exec permissions from trace_marker.tc test The tests in the selftests should not have the exec permissions set. The trace_marker.tc test accidentally was committed with the exec permission. Set the permission to that file back to just read/write. No functional nor code changes. Link: https://lore.kernel.org/linux-trace-kernel/20231222112831.4c7fa500@gandalf.local.home/ Signed-off-by: Steven Rostedt (Google) --- tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc diff --git a/tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc old mode 100755 new mode 100644 -- GitLab From 60e43fe5285e2077ce9904d78cd42a230d03b788 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:02:31 -0700 Subject: [PATCH 0492/1290] lib/firmware_table: tables: Add CDAT table parsing support The CDAT table is very similar to ACPI tables when it comes to sub-table and entry structures. The helper functions can be also used to parse the CDAT table. Add support to the helper functions to deal with an external CDAT table, and also handle the endieness since CDAT can be processed by a BE host. Export a function cdat_table_parse() for CXL driver to parse a CDAT table. In order to minimize ACPICA code changes, __force is being utilized to deal with the case of a big endian (BE) host parsing a CDAT. All CDAT data structure variables are being force casted to __leX as appropriate. Cc: Rafael J. Wysocki Cc: Len Brown Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/170319615131.2212653.10932785667981494238.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/acpi/tables.c | 5 +-- include/linux/fw_table.h | 21 ++++++++++- lib/fw_table.c | 75 +++++++++++++++++++++++++++++++++------- 3 files changed, 86 insertions(+), 15 deletions(-) diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index c1516337f6682..b07f7d091d133 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -251,8 +251,9 @@ int __init_or_acpilib acpi_table_parse_entries_array( return -ENODEV; } - count = acpi_parse_entries_array(id, table_size, table_header, - proc, proc_num, max_entries); + count = acpi_parse_entries_array(id, table_size, + (union fw_table_header *)table_header, + proc, proc_num, max_entries); acpi_put_table(table_header); return count; diff --git a/include/linux/fw_table.h b/include/linux/fw_table.h index ca49947f0a775..95421860397a2 100644 --- a/include/linux/fw_table.h +++ b/include/linux/fw_table.h @@ -25,16 +25,35 @@ struct acpi_subtable_proc { int count; }; +union fw_table_header { + struct acpi_table_header acpi; + struct acpi_table_cdat cdat; +}; + union acpi_subtable_headers { struct acpi_subtable_header common; struct acpi_hmat_structure hmat; struct acpi_prmt_module_header prmt; struct acpi_cedt_header cedt; + struct acpi_cdat_header cdat; }; int acpi_parse_entries_array(char *id, unsigned long table_size, - struct acpi_table_header *table_header, + union fw_table_header *table_header, struct acpi_subtable_proc *proc, int proc_num, unsigned int max_entries); +int cdat_table_parse(enum acpi_cdat_type type, + acpi_tbl_entry_handler_arg handler_arg, void *arg, + struct acpi_table_cdat *table_header); + +/* CXL is the only non-ACPI consumer of the FIRMWARE_TABLE library */ +#if IS_ENABLED(CONFIG_ACPI) && !IS_ENABLED(CONFIG_CXL_BUS) +#define EXPORT_SYMBOL_FWTBL_LIB(x) EXPORT_SYMBOL_ACPI_LIB(x) +#define __init_or_fwtbl_lib __init_or_acpilib +#else +#define EXPORT_SYMBOL_FWTBL_LIB(x) EXPORT_SYMBOL_NS_GPL(x, CXL) +#define __init_or_fwtbl_lib +#endif + #endif diff --git a/lib/fw_table.c b/lib/fw_table.c index 294df54e33b6f..1e5e0b2f70120 100644 --- a/lib/fw_table.c +++ b/lib/fw_table.c @@ -12,12 +12,14 @@ #include #include #include +#include enum acpi_subtable_type { ACPI_SUBTABLE_COMMON, ACPI_SUBTABLE_HMAT, ACPI_SUBTABLE_PRMT, ACPI_SUBTABLE_CEDT, + CDAT_SUBTABLE, }; struct acpi_subtable_entry { @@ -25,7 +27,7 @@ struct acpi_subtable_entry { enum acpi_subtable_type type; }; -static unsigned long __init_or_acpilib +static unsigned long __init_or_fwtbl_lib acpi_get_entry_type(struct acpi_subtable_entry *entry) { switch (entry->type) { @@ -37,11 +39,13 @@ acpi_get_entry_type(struct acpi_subtable_entry *entry) return 0; case ACPI_SUBTABLE_CEDT: return entry->hdr->cedt.type; + case CDAT_SUBTABLE: + return entry->hdr->cdat.type; } return 0; } -static unsigned long __init_or_acpilib +static unsigned long __init_or_fwtbl_lib acpi_get_entry_length(struct acpi_subtable_entry *entry) { switch (entry->type) { @@ -53,11 +57,16 @@ acpi_get_entry_length(struct acpi_subtable_entry *entry) return entry->hdr->prmt.length; case ACPI_SUBTABLE_CEDT: return entry->hdr->cedt.length; + case CDAT_SUBTABLE: { + __le16 length = (__force __le16)entry->hdr->cdat.length; + + return le16_to_cpu(length); + } } return 0; } -static unsigned long __init_or_acpilib +static unsigned long __init_or_fwtbl_lib acpi_get_subtable_header_length(struct acpi_subtable_entry *entry) { switch (entry->type) { @@ -69,11 +78,13 @@ acpi_get_subtable_header_length(struct acpi_subtable_entry *entry) return sizeof(entry->hdr->prmt); case ACPI_SUBTABLE_CEDT: return sizeof(entry->hdr->cedt); + case CDAT_SUBTABLE: + return sizeof(entry->hdr->cdat); } return 0; } -static enum acpi_subtable_type __init_or_acpilib +static enum acpi_subtable_type __init_or_fwtbl_lib acpi_get_subtable_type(char *id) { if (strncmp(id, ACPI_SIG_HMAT, 4) == 0) @@ -82,17 +93,32 @@ acpi_get_subtable_type(char *id) return ACPI_SUBTABLE_PRMT; if (strncmp(id, ACPI_SIG_CEDT, 4) == 0) return ACPI_SUBTABLE_CEDT; + if (strncmp(id, ACPI_SIG_CDAT, 4) == 0) + return CDAT_SUBTABLE; return ACPI_SUBTABLE_COMMON; } -static __init_or_acpilib bool has_handler(struct acpi_subtable_proc *proc) +static unsigned long __init_or_fwtbl_lib +acpi_table_get_length(enum acpi_subtable_type type, + union fw_table_header *header) +{ + if (type == CDAT_SUBTABLE) { + __le32 length = (__force __le32)header->cdat.length; + + return le32_to_cpu(length); + } + + return header->acpi.length; +} + +static __init_or_fwtbl_lib bool has_handler(struct acpi_subtable_proc *proc) { return proc->handler || proc->handler_arg; } -static __init_or_acpilib int call_handler(struct acpi_subtable_proc *proc, - union acpi_subtable_headers *hdr, - unsigned long end) +static __init_or_fwtbl_lib int call_handler(struct acpi_subtable_proc *proc, + union acpi_subtable_headers *hdr, + unsigned long end) { if (proc->handler) return proc->handler(hdr, end); @@ -124,23 +150,26 @@ static __init_or_acpilib int call_handler(struct acpi_subtable_proc *proc, * On success returns sum of all matching entries for all proc handlers. * Otherwise, -ENODEV or -EINVAL is returned. */ -int __init_or_acpilib +int __init_or_fwtbl_lib acpi_parse_entries_array(char *id, unsigned long table_size, - struct acpi_table_header *table_header, + union fw_table_header *table_header, struct acpi_subtable_proc *proc, int proc_num, unsigned int max_entries) { unsigned long table_end, subtable_len, entry_len; struct acpi_subtable_entry entry; + enum acpi_subtable_type type; int count = 0; int errs = 0; int i; - table_end = (unsigned long)table_header + table_header->length; + type = acpi_get_subtable_type(id); + table_end = (unsigned long)table_header + + acpi_table_get_length(type, table_header); /* Parse all entries looking for a match. */ - entry.type = acpi_get_subtable_type(id); + entry.type = type; entry.hdr = (union acpi_subtable_headers *) ((unsigned long)table_header + table_size); subtable_len = acpi_get_subtable_header_length(&entry); @@ -186,3 +215,25 @@ acpi_parse_entries_array(char *id, unsigned long table_size, return errs ? -EINVAL : count; } + +int __init_or_fwtbl_lib +cdat_table_parse(enum acpi_cdat_type type, + acpi_tbl_entry_handler_arg handler_arg, + void *arg, + struct acpi_table_cdat *table_header) +{ + struct acpi_subtable_proc proc = { + .id = type, + .handler_arg = handler_arg, + .arg = arg, + }; + + if (!table_header) + return -EINVAL; + + return acpi_parse_entries_array(ACPI_SIG_CDAT, + sizeof(struct acpi_table_cdat), + (union fw_table_header *)table_header, + &proc, 1, 0); +} +EXPORT_SYMBOL_FWTBL_LIB(cdat_table_parse); -- GitLab From 6a954e94d038f41d79c4e04348c95774d1c9337d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:02:37 -0700 Subject: [PATCH 0493/1290] base/node / acpi: Change 'node_hmem_attrs' to 'access_coordinates' Dan Williams suggested changing the struct 'node_hmem_attrs' to 'access_coordinates' [1]. The struct is a container of r/w-latency and r/w-bandwidth numbers. Moving forward, this container will also be used by CXL to store the performance characteristics of each link hop in the PCIE/CXL topology. So, where node_hmem_attrs is just the access parameters of a memory-node, access_coordinates applies more broadly to hardware topology characteristics. The observation is that seemed like an exercise in having the application identify "where" it falls on a spectrum of bandwidth and latency needs. For the tuple of read/write-latency and read/write-bandwidth, "coordinates" is not a perfect fit. Sometimes it is just conveying values in isolation and not a "location" relative to other performance points, but in the end this data is used to identify the performance operation point of a given memory-node. [2] Link: http://lore.kernel.org/r/64471313421f7_1b66294d5@dwillia2-xfh.jf.intel.com.notmuch/ Link: https://lore.kernel.org/linux-cxl/645e6215ee0de_1e6f2945e@dwillia2-xfh.jf.intel.com.notmuch/ Suggested-by: Dan Williams Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/170319615734.2212653.15319394025985499185.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/acpi/numa/hmat.c | 28 ++++++++++++++-------------- drivers/base/node.c | 12 ++++++------ include/linux/memory-tiers.h | 10 +++++----- include/linux/node.h | 8 ++++---- mm/memory-tiers.c | 12 ++++++------ 5 files changed, 35 insertions(+), 35 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 9ef5f1bdcfdbc..83bc2b69401bf 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -63,7 +63,7 @@ struct memory_target { unsigned int memory_pxm; unsigned int processor_pxm; struct resource memregions; - struct node_hmem_attrs hmem_attrs[2]; + struct access_coordinate coord[2]; struct list_head caches; struct node_cache_attrs cache_attrs; bool registered; @@ -228,24 +228,24 @@ static void hmat_update_target_access(struct memory_target *target, { switch (type) { case ACPI_HMAT_ACCESS_LATENCY: - target->hmem_attrs[access].read_latency = value; - target->hmem_attrs[access].write_latency = value; + target->coord[access].read_latency = value; + target->coord[access].write_latency = value; break; case ACPI_HMAT_READ_LATENCY: - target->hmem_attrs[access].read_latency = value; + target->coord[access].read_latency = value; break; case ACPI_HMAT_WRITE_LATENCY: - target->hmem_attrs[access].write_latency = value; + target->coord[access].write_latency = value; break; case ACPI_HMAT_ACCESS_BANDWIDTH: - target->hmem_attrs[access].read_bandwidth = value; - target->hmem_attrs[access].write_bandwidth = value; + target->coord[access].read_bandwidth = value; + target->coord[access].write_bandwidth = value; break; case ACPI_HMAT_READ_BANDWIDTH: - target->hmem_attrs[access].read_bandwidth = value; + target->coord[access].read_bandwidth = value; break; case ACPI_HMAT_WRITE_BANDWIDTH: - target->hmem_attrs[access].write_bandwidth = value; + target->coord[access].write_bandwidth = value; break; default: break; @@ -681,7 +681,7 @@ static void hmat_register_target_cache(struct memory_target *target) static void hmat_register_target_perf(struct memory_target *target, int access) { unsigned mem_nid = pxm_to_node(target->memory_pxm); - node_set_perf_attrs(mem_nid, &target->hmem_attrs[access], access); + node_set_perf_attrs(mem_nid, &target->coord[access], access); } static void hmat_register_target_devices(struct memory_target *target) @@ -765,7 +765,7 @@ static int hmat_set_default_dram_perf(void) int rc; int nid, pxm; struct memory_target *target; - struct node_hmem_attrs *attrs; + struct access_coordinate *attrs; if (!default_dram_type) return -EIO; @@ -775,7 +775,7 @@ static int hmat_set_default_dram_perf(void) target = find_mem_target(pxm); if (!target) continue; - attrs = &target->hmem_attrs[1]; + attrs = &target->coord[1]; rc = mt_set_default_dram_perf(nid, attrs, "ACPI HMAT"); if (rc) return rc; @@ -789,7 +789,7 @@ static int hmat_calculate_adistance(struct notifier_block *self, { static DECLARE_BITMAP(p_nodes, MAX_NUMNODES); struct memory_target *target; - struct node_hmem_attrs *perf; + struct access_coordinate *perf; int *adist = data; int pxm; @@ -802,7 +802,7 @@ static int hmat_calculate_adistance(struct notifier_block *self, hmat_update_target_attrs(target, p_nodes, 1); mutex_unlock(&target_lock); - perf = &target->hmem_attrs[1]; + perf = &target->coord[1]; if (mt_perf_to_adistance(perf, adist)) return NOTIFY_OK; diff --git a/drivers/base/node.c b/drivers/base/node.c index 493d533f83755..cb2b6cc7f6e69 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -74,14 +74,14 @@ static BIN_ATTR_RO(cpulist, CPULIST_FILE_MAX_BYTES); * @dev: Device for this memory access class * @list_node: List element in the node's access list * @access: The access class rank - * @hmem_attrs: Heterogeneous memory performance attributes + * @coord: Heterogeneous memory performance coordinates */ struct node_access_nodes { struct device dev; struct list_head list_node; unsigned int access; #ifdef CONFIG_HMEM_REPORTING - struct node_hmem_attrs hmem_attrs; + struct access_coordinate coord; #endif }; #define to_access_nodes(dev) container_of(dev, struct node_access_nodes, dev) @@ -167,7 +167,7 @@ static ssize_t property##_show(struct device *dev, \ char *buf) \ { \ return sysfs_emit(buf, "%u\n", \ - to_access_nodes(dev)->hmem_attrs.property); \ + to_access_nodes(dev)->coord.property); \ } \ static DEVICE_ATTR_RO(property) @@ -187,10 +187,10 @@ static struct attribute *access_attrs[] = { /** * node_set_perf_attrs - Set the performance values for given access class * @nid: Node identifier to be set - * @hmem_attrs: Heterogeneous memory performance attributes + * @coord: Heterogeneous memory performance coordinates * @access: The access class the for the given attributes */ -void node_set_perf_attrs(unsigned int nid, struct node_hmem_attrs *hmem_attrs, +void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, unsigned int access) { struct node_access_nodes *c; @@ -205,7 +205,7 @@ void node_set_perf_attrs(unsigned int nid, struct node_hmem_attrs *hmem_attrs, if (!c) return; - c->hmem_attrs = *hmem_attrs; + c->coord = *coord; for (i = 0; access_attrs[i] != NULL; i++) { if (sysfs_add_file_to_group(&c->dev.kobj, access_attrs[i], "initiators")) { diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 1e39d27bee418..69e7819000827 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -33,7 +33,7 @@ struct memory_dev_type { struct kref kref; }; -struct node_hmem_attrs; +struct access_coordinate; #ifdef CONFIG_NUMA extern bool numa_demotion_enabled; @@ -45,9 +45,9 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype); int register_mt_adistance_algorithm(struct notifier_block *nb); int unregister_mt_adistance_algorithm(struct notifier_block *nb); int mt_calc_adistance(int node, int *adist); -int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf, +int mt_set_default_dram_perf(int nid, struct access_coordinate *perf, const char *source); -int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist); +int mt_perf_to_adistance(struct access_coordinate *perf, int *adist); #ifdef CONFIG_MIGRATION int next_demotion_node(int node); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); @@ -126,13 +126,13 @@ static inline int mt_calc_adistance(int node, int *adist) return NOTIFY_DONE; } -static inline int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf, +static inline int mt_set_default_dram_perf(int nid, struct access_coordinate *perf, const char *source) { return -EIO; } -static inline int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist) +static inline int mt_perf_to_adistance(struct access_coordinate *perf, int *adist) { return -EIO; } diff --git a/include/linux/node.h b/include/linux/node.h index 427a5975cf405..25b66d705ee2e 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -20,14 +20,14 @@ #include /** - * struct node_hmem_attrs - heterogeneous memory performance attributes + * struct access_coordinate - generic performance coordinates container * * @read_bandwidth: Read bandwidth in MB/s * @write_bandwidth: Write bandwidth in MB/s * @read_latency: Read latency in nanoseconds * @write_latency: Write latency in nanoseconds */ -struct node_hmem_attrs { +struct access_coordinate { unsigned int read_bandwidth; unsigned int write_bandwidth; unsigned int read_latency; @@ -65,7 +65,7 @@ struct node_cache_attrs { #ifdef CONFIG_HMEM_REPORTING void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs); -void node_set_perf_attrs(unsigned int nid, struct node_hmem_attrs *hmem_attrs, +void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, unsigned access); #else static inline void node_add_cache(unsigned int nid, @@ -74,7 +74,7 @@ static inline void node_add_cache(unsigned int nid, } static inline void node_set_perf_attrs(unsigned int nid, - struct node_hmem_attrs *hmem_attrs, + struct access_coordinate *coord, unsigned access) { } diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 8d5291add2bce..5462d9e3c84c7 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -109,7 +109,7 @@ static struct demotion_nodes *node_demotion __read_mostly; static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); static bool default_dram_perf_error; -static struct node_hmem_attrs default_dram_perf; +static struct access_coordinate default_dram_perf; static int default_dram_perf_ref_nid = NUMA_NO_NODE; static const char *default_dram_perf_ref_source; @@ -601,15 +601,15 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) } EXPORT_SYMBOL_GPL(clear_node_memory_type); -static void dump_hmem_attrs(struct node_hmem_attrs *attrs, const char *prefix) +static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix) { pr_info( "%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n", - prefix, attrs->read_latency, attrs->write_latency, - attrs->read_bandwidth, attrs->write_bandwidth); + prefix, coord->read_latency, coord->write_latency, + coord->read_bandwidth, coord->write_bandwidth); } -int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf, +int mt_set_default_dram_perf(int nid, struct access_coordinate *perf, const char *source) { int rc = 0; @@ -666,7 +666,7 @@ out: return rc; } -int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist) +int mt_perf_to_adistance(struct access_coordinate *perf, int *adist) { if (default_dram_perf_error) return -EIO; -- GitLab From 69b789b64456093819f730b3f9c13a593a5485d9 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:02:43 -0700 Subject: [PATCH 0494/1290] acpi: numa: Create enum for memory_target access coordinates indexing Create enums to provide named indexing for the access coordinate array. This is in preparation for adding generic port support which will add a third index in the array to keep the generic port attributes separate from the memory attributes. Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/170319616332.2212653.3872789279950567889.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/acpi/numa/hmat.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 83bc2b69401bf..ca7aedfbb5f2d 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -58,12 +58,18 @@ struct target_cache { struct node_cache_attrs cache_attrs; }; +enum { + NODE_ACCESS_CLASS_0 = 0, + NODE_ACCESS_CLASS_1, + NODE_ACCESS_CLASS_MAX, +}; + struct memory_target { struct list_head node; unsigned int memory_pxm; unsigned int processor_pxm; struct resource memregions; - struct access_coordinate coord[2]; + struct access_coordinate coord[NODE_ACCESS_CLASS_MAX]; struct list_head caches; struct node_cache_attrs cache_attrs; bool registered; @@ -339,10 +345,12 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, if (mem_hier == ACPI_HMAT_MEMORY) { target = find_mem_target(targs[targ]); if (target && target->processor_pxm == inits[init]) { - hmat_update_target_access(target, type, value, 0); + hmat_update_target_access(target, type, value, + NODE_ACCESS_CLASS_0); /* If the node has a CPU, update access 1 */ if (node_state(pxm_to_node(inits[init]), N_CPU)) - hmat_update_target_access(target, type, value, 1); + hmat_update_target_access(target, type, value, + NODE_ACCESS_CLASS_1); } } } @@ -726,8 +734,8 @@ static void hmat_register_target(struct memory_target *target) if (!target->registered) { hmat_register_target_initiators(target); hmat_register_target_cache(target); - hmat_register_target_perf(target, 0); - hmat_register_target_perf(target, 1); + hmat_register_target_perf(target, NODE_ACCESS_CLASS_0); + hmat_register_target_perf(target, NODE_ACCESS_CLASS_1); target->registered = true; } mutex_unlock(&target_lock); -- GitLab From 6373c48b8c9dfb5c1e09fdb538e700d9cc91c45e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:02:49 -0700 Subject: [PATCH 0495/1290] acpi: numa: Add genport target allocation to the HMAT parsing Add SRAT parsing for the HMAT init in order to collect the device handle from the Generic Port Affinity Structure. The device handle will serve as the key to search for target data. Consolidate the common code with alloc_memory_target() in a helper function alloc_target(). Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/170319616951.2212653.14862375982250406464.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/acpi/numa/hmat.c | 59 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index ca7aedfbb5f2d..21722cbec324d 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -72,6 +72,7 @@ struct memory_target { struct access_coordinate coord[NODE_ACCESS_CLASS_MAX]; struct list_head caches; struct node_cache_attrs cache_attrs; + u8 gen_port_device_handle[ACPI_SRAT_DEVICE_HANDLE_SIZE]; bool registered; }; @@ -126,8 +127,7 @@ static __init void alloc_memory_initiator(unsigned int cpu_pxm) list_add_tail(&initiator->node, &initiators); } -static __init void alloc_memory_target(unsigned int mem_pxm, - resource_size_t start, resource_size_t len) +static __init struct memory_target *alloc_target(unsigned int mem_pxm) { struct memory_target *target; @@ -135,7 +135,7 @@ static __init void alloc_memory_target(unsigned int mem_pxm, if (!target) { target = kzalloc(sizeof(*target), GFP_KERNEL); if (!target) - return; + return NULL; target->memory_pxm = mem_pxm; target->processor_pxm = PXM_INVAL; target->memregions = (struct resource) { @@ -148,6 +148,19 @@ static __init void alloc_memory_target(unsigned int mem_pxm, INIT_LIST_HEAD(&target->caches); } + return target; +} + +static __init void alloc_memory_target(unsigned int mem_pxm, + resource_size_t start, + resource_size_t len) +{ + struct memory_target *target; + + target = alloc_target(mem_pxm); + if (!target) + return; + /* * There are potentially multiple ranges per PXM, so record each * in the per-target memregions resource tree. @@ -158,6 +171,18 @@ static __init void alloc_memory_target(unsigned int mem_pxm, start, start + len, mem_pxm); } +static __init void alloc_genport_target(unsigned int mem_pxm, u8 *handle) +{ + struct memory_target *target; + + target = alloc_target(mem_pxm); + if (!target) + return; + + memcpy(target->gen_port_device_handle, handle, + ACPI_SRAT_DEVICE_HANDLE_SIZE); +} + static __init const char *hmat_data_type(u8 type) { switch (type) { @@ -499,6 +524,27 @@ static __init int srat_parse_mem_affinity(union acpi_subtable_headers *header, return 0; } +static __init int srat_parse_genport_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_generic_affinity *ga = (void *)header; + + if (!ga) + return -EINVAL; + + if (!(ga->flags & ACPI_SRAT_GENERIC_AFFINITY_ENABLED)) + return 0; + + /* Skip PCI device_handle for now */ + if (ga->device_handle_type != 0) + return 0; + + alloc_genport_target(ga->proximity_domain, + (u8 *)ga->device_handle); + + return 0; +} + static u32 hmat_initiator_perf(struct memory_target *target, struct memory_initiator *initiator, struct acpi_hmat_locality *hmat_loc) @@ -878,6 +924,13 @@ static __init int hmat_init(void) ACPI_SRAT_TYPE_MEMORY_AFFINITY, srat_parse_mem_affinity, 0) < 0) goto out_put; + + if (acpi_table_parse_entries(ACPI_SIG_SRAT, + sizeof(struct acpi_table_srat), + ACPI_SRAT_TYPE_GENERIC_PORT_AFFINITY, + srat_parse_genport_affinity, 0) < 0) + goto out_put; + acpi_put_table(tbl); status = acpi_get_table(ACPI_SIG_HMAT, 0, &tbl); -- GitLab From 79205651120620c2683f90c25ef3d2ac8e454026 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:02:55 -0700 Subject: [PATCH 0496/1290] acpi: Break out nesting for hmat_parse_locality() Refactor hmat_parse_locality() to break up the deep nesting of the function. Suggested-by: Jonathan Cameron Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/170319617537.2212653.10625501075519862509.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/acpi/numa/hmat.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 21722cbec324d..4cae2e84251a2 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -322,11 +322,28 @@ static __init void hmat_add_locality(struct acpi_hmat_locality *hmat_loc) } } +static __init void hmat_update_target(unsigned int tgt_pxm, unsigned int init_pxm, + u8 mem_hier, u8 type, u32 value) +{ + struct memory_target *target = find_mem_target(tgt_pxm); + + if (mem_hier != ACPI_HMAT_MEMORY) + return; + + if (target && target->processor_pxm == init_pxm) { + hmat_update_target_access(target, type, value, + NODE_ACCESS_CLASS_0); + /* If the node has a CPU, update access 1 */ + if (node_state(pxm_to_node(init_pxm), N_CPU)) + hmat_update_target_access(target, type, value, + NODE_ACCESS_CLASS_1); + } +} + static __init int hmat_parse_locality(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_hmat_locality *hmat_loc = (void *)header; - struct memory_target *target; unsigned int init, targ, total_size, ipds, tpds; u32 *inits, *targs, value; u16 *entries; @@ -367,17 +384,8 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, inits[init], targs[targ], value, hmat_data_type_suffix(type)); - if (mem_hier == ACPI_HMAT_MEMORY) { - target = find_mem_target(targs[targ]); - if (target && target->processor_pxm == inits[init]) { - hmat_update_target_access(target, type, value, - NODE_ACCESS_CLASS_0); - /* If the node has a CPU, update access 1 */ - if (node_state(pxm_to_node(inits[init]), N_CPU)) - hmat_update_target_access(target, type, value, - NODE_ACCESS_CLASS_1); - } - } + hmat_update_target(targs[targ], inits[init], + mem_hier, type, value); } } -- GitLab From a3a3e341f169511823f7b2d140a0bdfbd620dcbd Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:03:01 -0700 Subject: [PATCH 0497/1290] acpi: numa: Add setting of generic port system locality attributes Add generic port support for the parsing of HMAT system locality sub-table. The attributes will be added to the third array member of the access coordinates in order to not mix with the existing memory attributes. It only provides the system locality attributes from initiator to the generic port targets and is missing the rest of the data to the actual memory device. The complete attributes will be updated when a memory device is attached and the system locality information is calculated end to end. Through hmat_update_target_attrs(), the best performance attributes will be setup in target->coord. Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/170319618135.2212653.13778540010384821833.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/acpi/numa/hmat.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 4cae2e84251a2..8a1802e078f3c 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -61,6 +61,7 @@ struct target_cache { enum { NODE_ACCESS_CLASS_0 = 0, NODE_ACCESS_CLASS_1, + NODE_ACCESS_CLASS_GENPORT_SINK, NODE_ACCESS_CLASS_MAX, }; @@ -654,6 +655,11 @@ static void hmat_update_target_attrs(struct memory_target *target, u32 best = 0; int i; + /* Don't update for generic port if there's no device handle */ + if (access == NODE_ACCESS_CLASS_GENPORT_SINK && + !(*(u16 *)target->gen_port_device_handle)) + return; + bitmap_zero(p_nodes, MAX_NUMNODES); /* * If the Address Range Structure provides a local processor pxm, set @@ -723,6 +729,14 @@ static void __hmat_register_target_initiators(struct memory_target *target, } } +static void hmat_register_generic_target_initiators(struct memory_target *target) +{ + static DECLARE_BITMAP(p_nodes, MAX_NUMNODES); + + __hmat_register_target_initiators(target, p_nodes, + NODE_ACCESS_CLASS_GENPORT_SINK); +} + static void hmat_register_target_initiators(struct memory_target *target) { static DECLARE_BITMAP(p_nodes, MAX_NUMNODES); @@ -774,6 +788,17 @@ static void hmat_register_target(struct memory_target *target) */ hmat_register_target_devices(target); + /* + * Register generic port perf numbers. The nid may not be + * initialized and is still NUMA_NO_NODE. + */ + mutex_lock(&target_lock); + if (*(u16 *)target->gen_port_device_handle) { + hmat_register_generic_target_initiators(target); + target->registered = true; + } + mutex_unlock(&target_lock); + /* * Skip offline nodes. This can happen when memory * marked EFI_MEMORY_SP, "specific purpose", is applied -- GitLab From ca53543d8e340070fb37fde93f36ed9012c76b90 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:03:07 -0700 Subject: [PATCH 0498/1290] acpi: numa: Add helper function to retrieve the performance attributes Add helper to retrieve the performance attributes based on the device handle. The helper function is exported so the CXL driver can use that to acquire the performance data between the CPU and the CXL host bridge. Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/170319618721.2212653.5552947472849081786.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/acpi/numa/hmat.c | 41 ++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 11 +++++++++++ 2 files changed, 52 insertions(+) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 8a1802e078f3c..d6b85f0f6082f 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -108,6 +108,47 @@ static struct memory_target *find_mem_target(unsigned int mem_pxm) return NULL; } +static struct memory_target *acpi_find_genport_target(u32 uid) +{ + struct memory_target *target; + u32 target_uid; + u8 *uid_ptr; + + list_for_each_entry(target, &targets, node) { + uid_ptr = target->gen_port_device_handle + 8; + target_uid = *(u32 *)uid_ptr; + if (uid == target_uid) + return target; + } + + return NULL; +} + +/** + * acpi_get_genport_coordinates - Retrieve the access coordinates for a generic port + * @uid: ACPI unique id + * @coord: The access coordinates written back out for the generic port + * + * Return: 0 on success. Errno on failure. + * + * Only supports device handles that are ACPI. Assume ACPI0016 HID for CXL. + */ +int acpi_get_genport_coordinates(u32 uid, + struct access_coordinate *coord) +{ + struct memory_target *target; + + guard(mutex)(&target_lock); + target = acpi_find_genport_target(uid); + if (!target) + return -ENOENT; + + *coord = target->coord[NODE_ACCESS_CLASS_GENPORT_SINK]; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(acpi_get_genport_coordinates, CXL); + static __init void alloc_memory_initiator(unsigned int cpu_pxm) { struct memory_initiator *initiator; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4db54e928b36d..8b0761c682f99 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -15,6 +15,7 @@ #include #include #include +#include struct irq_domain; struct irq_domain_ops; @@ -424,6 +425,16 @@ extern int acpi_blacklisted(void); extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); +#ifdef CONFIG_ACPI_HMAT +int acpi_get_genport_coordinates(u32 uid, struct access_coordinate *coord); +#else +static inline int acpi_get_genport_coordinates(u32 uid, + struct access_coordinate *coord) +{ + return -EOPNOTSUPP; +} +#endif + #ifdef CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); -- GitLab From ad6f04c0269b0b7908f09621d3b3c90def39a297 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:03:13 -0700 Subject: [PATCH 0499/1290] cxl: Add callback to parse the DSMAS subtables from CDAT Provide a callback function to the CDAT parser in order to parse the Device Scoped Memory Affinity Structure (DSMAS). Each DSMAS structure contains the DPA range and its associated attributes in each entry. See the CDAT specification for details. The device handle and the DPA range is saved and to be associated with the DSLBIS locality data when the DSLBIS entries are parsed. The xarray is a local variable. When the total path performance data is calculated and storred this xarray can be discarded. Coherent Device Attribute Table 1.03 2.1 Device Scoped memory Affinity Structure (DSMAS) Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170319619355.2212653.2675953129671561293.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/Kconfig | 3 ++ drivers/cxl/core/Makefile | 1 + drivers/cxl/core/cdat.c | 92 +++++++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 2 + drivers/cxl/port.c | 1 + tools/testing/cxl/Kbuild | 1 + 6 files changed, 100 insertions(+) create mode 100644 drivers/cxl/core/cdat.c diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 8ea1d340e4385..67998dbd1d46b 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -5,6 +5,7 @@ menuconfig CXL_BUS select FW_LOADER select FW_UPLOAD select PCI_DOE + select FIRMWARE_TABLE help CXL is a bus that is electrically compatible with PCI Express, but layers three protocols on that signalling (CXL.io, CXL.cache, and @@ -54,8 +55,10 @@ config CXL_MEM_RAW_COMMANDS config CXL_ACPI tristate "CXL ACPI: Platform Support" depends on ACPI + depends on ACPI_NUMA default CXL_BUS select ACPI_TABLE_LIB + select ACPI_HMAT help Enable support for host managed device memory (HDM) resources published by a platform's ACPI CXL memory layout description. See diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 1f66b5d4d9355..9259bcc6773c8 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -13,5 +13,6 @@ cxl_core-y += mbox.o cxl_core-y += pci.o cxl_core-y += hdm.o cxl_core-y += pmu.o +cxl_core-y += cdat.o cxl_core-$(CONFIG_TRACING) += trace.o cxl_core-$(CONFIG_CXL_REGION) += region.o diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c new file mode 100644 index 0000000000000..9bf4f53bf77fe --- /dev/null +++ b/drivers/cxl/core/cdat.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include "cxlpci.h" +#include "cxl.h" + +struct dsmas_entry { + struct range dpa_range; + u8 handle; +}; + +static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg, + const unsigned long end) +{ + struct acpi_cdat_header *hdr = &header->cdat; + struct acpi_cdat_dsmas *dsmas; + int size = sizeof(*hdr) + sizeof(*dsmas); + struct xarray *dsmas_xa = arg; + struct dsmas_entry *dent; + u16 len; + int rc; + + len = le16_to_cpu((__force __le16)hdr->length); + if (len != size || (unsigned long)hdr + len > end) { + pr_warn("Malformed DSMAS table length: (%u:%u)\n", size, len); + return -EINVAL; + } + + /* Skip common header */ + dsmas = (struct acpi_cdat_dsmas *)(hdr + 1); + + dent = kzalloc(sizeof(*dent), GFP_KERNEL); + if (!dent) + return -ENOMEM; + + dent->handle = dsmas->dsmad_handle; + dent->dpa_range.start = le64_to_cpu((__force __le64)dsmas->dpa_base_address); + dent->dpa_range.end = le64_to_cpu((__force __le64)dsmas->dpa_base_address) + + le64_to_cpu((__force __le64)dsmas->dpa_length) - 1; + + rc = xa_insert(dsmas_xa, dent->handle, dent, GFP_KERNEL); + if (rc) { + kfree(dent); + return rc; + } + + return 0; +} + +static int cxl_cdat_endpoint_process(struct cxl_port *port, + struct xarray *dsmas_xa) +{ + return cdat_table_parse(ACPI_CDAT_TYPE_DSMAS, cdat_dsmas_handler, + dsmas_xa, port->cdat.table); +} + +static void discard_dsmas(struct xarray *xa) +{ + unsigned long index; + void *ent; + + xa_for_each(xa, index, ent) { + xa_erase(xa, index); + kfree(ent); + } + xa_destroy(xa); +} +DEFINE_FREE(dsmas, struct xarray *, if (_T) discard_dsmas(_T)) + +void cxl_endpoint_parse_cdat(struct cxl_port *port) +{ + struct xarray __dsmas_xa; + struct xarray *dsmas_xa __free(dsmas) = &__dsmas_xa; + int rc; + + xa_init(&__dsmas_xa); + if (!port->cdat.table) + return; + + rc = cxl_cdat_endpoint_process(port, dsmas_xa); + if (rc < 0) { + dev_dbg(&port->dev, "Failed to parse CDAT: %d\n", rc); + return; + } + + /* Performance data processing */ +} +EXPORT_SYMBOL_NS_GPL(cxl_endpoint_parse_cdat, CXL); + +MODULE_IMPORT_NS(CXL); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 687043ece1018..be3b5eda875c6 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -839,6 +839,8 @@ static inline struct cxl_dax_region *to_cxl_dax_region(struct device *dev) } #endif +void cxl_endpoint_parse_cdat(struct cxl_port *port); + /* * Unit test builds overrides this to __weak, find the 'strong' version * of these symbols in tools/testing/cxl/. diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 47bc8e0b85907..a889c4e6cb272 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -109,6 +109,7 @@ static int cxl_endpoint_port_probe(struct cxl_port *port) /* Cache the data early to ensure is_visible() works */ read_cdat_data(port); + cxl_endpoint_parse_cdat(port); get_device(&cxlmd->dev); rc = devm_add_action_or_reset(&port->dev, schedule_detach, cxlmd); diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 95dc58b94178b..0b12c36902d82 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -58,6 +58,7 @@ cxl_core-y += $(CXL_CORE_SRC)/mbox.o cxl_core-y += $(CXL_CORE_SRC)/pci.o cxl_core-y += $(CXL_CORE_SRC)/hdm.o cxl_core-y += $(CXL_CORE_SRC)/pmu.o +cxl_core-y += $(CXL_CORE_SRC)/cdat.o cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o cxl_core-y += config_check.o -- GitLab From 63cef81b9dca6ddf1c34d697016f830ddcfadf28 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:03:20 -0700 Subject: [PATCH 0500/1290] cxl: Add callback to parse the DSLBIS subtable from CDAT Provide a callback to parse the Device Scoped Latency and Bandwidth Information Structure (DSLBIS) in the CDAT structures. The DSLBIS contains the bandwidth and latency information that's tied to a DSMAS handle. The driver will retrieve the read and write latency and bandwidth associated with the DSMAS which is tied to a DPA range. Coherent Device Attribute Table 1.03 2.1 Device Scoped Latency and Bandwidth Information Structure (DSLBIS) Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170319620005.2212653.7475488478229720542.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/cdat.c | 102 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index 9bf4f53bf77fe..97d8ef8848c66 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -3,12 +3,15 @@ #include #include #include +#include +#include #include "cxlpci.h" #include "cxl.h" struct dsmas_entry { struct range dpa_range; u8 handle; + struct access_coordinate coord; }; static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg, @@ -49,11 +52,106 @@ static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg, return 0; } +static void cxl_access_coordinate_set(struct access_coordinate *coord, + int access, unsigned int val) +{ + switch (access) { + case ACPI_HMAT_ACCESS_LATENCY: + coord->read_latency = val; + coord->write_latency = val; + break; + case ACPI_HMAT_READ_LATENCY: + coord->read_latency = val; + break; + case ACPI_HMAT_WRITE_LATENCY: + coord->write_latency = val; + break; + case ACPI_HMAT_ACCESS_BANDWIDTH: + coord->read_bandwidth = val; + coord->write_bandwidth = val; + break; + case ACPI_HMAT_READ_BANDWIDTH: + coord->read_bandwidth = val; + break; + case ACPI_HMAT_WRITE_BANDWIDTH: + coord->write_bandwidth = val; + break; + } +} + +static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, + const unsigned long end) +{ + struct acpi_cdat_header *hdr = &header->cdat; + struct acpi_cdat_dslbis *dslbis; + int size = sizeof(*hdr) + sizeof(*dslbis); + struct xarray *dsmas_xa = arg; + struct dsmas_entry *dent; + __le64 le_base; + __le16 le_val; + u64 val; + u16 len; + int rc; + + len = le16_to_cpu((__force __le16)hdr->length); + if (len != size || (unsigned long)hdr + len > end) { + pr_warn("Malformed DSLBIS table length: (%u:%u)\n", size, len); + return -EINVAL; + } + + /* Skip common header */ + dslbis = (struct acpi_cdat_dslbis *)(hdr + 1); + + /* Skip unrecognized data type */ + if (dslbis->data_type > ACPI_HMAT_WRITE_BANDWIDTH) + return 0; + + /* Not a memory type, skip */ + if ((dslbis->flags & ACPI_HMAT_MEMORY_HIERARCHY) != ACPI_HMAT_MEMORY) + return 0; + + dent = xa_load(dsmas_xa, dslbis->handle); + if (!dent) { + pr_warn("No matching DSMAS entry for DSLBIS entry.\n"); + return 0; + } + + le_base = (__force __le64)dslbis->entry_base_unit; + le_val = (__force __le16)dslbis->entry[0]; + rc = check_mul_overflow(le64_to_cpu(le_base), + le16_to_cpu(le_val), &val); + if (rc) + pr_warn("DSLBIS value overflowed.\n"); + + cxl_access_coordinate_set(&dent->coord, dslbis->data_type, val); + + return 0; +} + +static int cdat_table_parse_output(int rc) +{ + if (rc < 0) + return rc; + if (rc == 0) + return -ENOENT; + + return 0; +} + static int cxl_cdat_endpoint_process(struct cxl_port *port, struct xarray *dsmas_xa) { - return cdat_table_parse(ACPI_CDAT_TYPE_DSMAS, cdat_dsmas_handler, - dsmas_xa, port->cdat.table); + int rc; + + rc = cdat_table_parse(ACPI_CDAT_TYPE_DSMAS, cdat_dsmas_handler, + dsmas_xa, port->cdat.table); + rc = cdat_table_parse_output(rc); + if (rc) + return rc; + + rc = cdat_table_parse(ACPI_CDAT_TYPE_DSLBIS, cdat_dslbis_handler, + dsmas_xa, port->cdat.table); + return cdat_table_parse_output(rc); } static void discard_dsmas(struct xarray *xa) -- GitLab From 80aa780dda20618be76162bf991d49cf962fda38 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:03:26 -0700 Subject: [PATCH 0501/1290] cxl: Add callback to parse the SSLBIS subtable from CDAT Provide a callback to parse the Switched Scoped Latency and Bandwidth Information Structure (SSLBIS) in the CDAT structures. The SSLBIS contains the bandwidth and latency information that's tied to the CXL switch that the data table has been read from. The extracted values are stored to the cxl_dport correlated by the port_id depending on the SSLBIS entry. Coherent Device Attribute Table 1.03 2.1 Switched Scoped Latency and Bandwidth Information Structure (DSLBIS) Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170319620635.2212653.5194389158785365150.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/cdat.c | 98 +++++++++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 4 ++ drivers/cxl/port.c | 2 + 3 files changed, 104 insertions(+) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index 97d8ef8848c66..b3ab47d250e16 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -187,4 +187,102 @@ void cxl_endpoint_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(cxl_endpoint_parse_cdat, CXL); +static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg, + const unsigned long end) +{ + struct acpi_cdat_sslbis *sslbis; + int size = sizeof(header->cdat) + sizeof(*sslbis); + struct cxl_port *port = arg; + struct device *dev = &port->dev; + struct acpi_cdat_sslbe *entry; + int remain, entries, i; + u16 len; + + len = le16_to_cpu((__force __le16)header->cdat.length); + remain = len - size; + if (!remain || remain % sizeof(*entry) || + (unsigned long)header + len > end) { + dev_warn(dev, "Malformed SSLBIS table length: (%u)\n", len); + return -EINVAL; + } + + /* Skip common header */ + sslbis = (struct acpi_cdat_sslbis *)((unsigned long)header + + sizeof(header->cdat)); + + /* Unrecognized data type, we can skip */ + if (sslbis->data_type > ACPI_HMAT_WRITE_BANDWIDTH) + return 0; + + entries = remain / sizeof(*entry); + entry = (struct acpi_cdat_sslbe *)((unsigned long)header + sizeof(*sslbis)); + + for (i = 0; i < entries; i++) { + u16 x = le16_to_cpu((__force __le16)entry->portx_id); + u16 y = le16_to_cpu((__force __le16)entry->porty_id); + __le64 le_base; + __le16 le_val; + struct cxl_dport *dport; + unsigned long index; + u16 dsp_id; + u64 val; + + switch (x) { + case ACPI_CDAT_SSLBIS_US_PORT: + dsp_id = y; + break; + case ACPI_CDAT_SSLBIS_ANY_PORT: + switch (y) { + case ACPI_CDAT_SSLBIS_US_PORT: + dsp_id = x; + break; + case ACPI_CDAT_SSLBIS_ANY_PORT: + dsp_id = ACPI_CDAT_SSLBIS_ANY_PORT; + break; + default: + dsp_id = y; + break; + } + break; + default: + dsp_id = x; + break; + } + + le_base = (__force __le64)sslbis->entry_base_unit; + le_val = (__force __le16)entry->latency_or_bandwidth; + + if (check_mul_overflow(le64_to_cpu(le_base), + le16_to_cpu(le_val), &val)) + dev_warn(dev, "SSLBIS value overflowed!\n"); + + xa_for_each(&port->dports, index, dport) { + if (dsp_id == ACPI_CDAT_SSLBIS_ANY_PORT || + dsp_id == dport->port_id) + cxl_access_coordinate_set(&dport->sw_coord, + sslbis->data_type, + val); + } + + entry++; + } + + return 0; +} + +void cxl_switch_parse_cdat(struct cxl_port *port) +{ + int rc; + + if (!port->cdat.table) + return; + + rc = cdat_table_parse(ACPI_CDAT_TYPE_SSLBIS, cdat_sslbis_handler, + port, port->cdat.table); + rc = cdat_table_parse_output(rc); + if (rc) + dev_dbg(&port->dev, "Failed to parse SSLBIS: %d\n", rc); +} +EXPORT_SYMBOL_NS_GPL(cxl_switch_parse_cdat, CXL); + MODULE_IMPORT_NS(CXL); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index be3b5eda875c6..22f664b9f4c63 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -8,6 +8,7 @@ #include #include #include +#include #include /** @@ -634,6 +635,7 @@ struct cxl_rcrb_info { * @rch: Indicate whether this dport was enumerated in RCH or VH mode * @port: reference to cxl_port that contains this downstream port * @regs: Dport parsed register blocks + * @sw_coord: access coordinates (performance) for switch from CDAT */ struct cxl_dport { struct device *dport_dev; @@ -643,6 +645,7 @@ struct cxl_dport { bool rch; struct cxl_port *port; struct cxl_regs regs; + struct access_coordinate sw_coord; }; /** @@ -840,6 +843,7 @@ static inline struct cxl_dax_region *to_cxl_dax_region(struct device *dev) #endif void cxl_endpoint_parse_cdat(struct cxl_port *port); +void cxl_switch_parse_cdat(struct cxl_port *port); /* * Unit test builds overrides this to __weak, find the 'strong' version diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index a889c4e6cb272..da3c3a08bd626 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -69,6 +69,8 @@ static int cxl_switch_port_probe(struct cxl_port *port) if (rc < 0) return rc; + cxl_switch_parse_cdat(port); + cxlhdm = devm_cxl_setup_hdm(port, NULL); if (!IS_ERR(cxlhdm)) return devm_cxl_enumerate_decoders(cxlhdm, NULL); -- GitLab From 790815902ec61ba1715fd67d3cb9036e13c942bc Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:03:32 -0700 Subject: [PATCH 0502/1290] cxl: Add support for _DSM Function for retrieving QTG ID CXL spec v3.0 9.17.3 CXL Root Device Specific Methods (_DSM) Add support to retrieve QTG ID via ACPI _DSM call. The _DSM call requires an input of an ACPI package with 4 dwords (read latency, write latency, read bandwidth, write bandwidth). The call returns a package with 1 WORD that provides the max supported QTG ID and a package that may contain 0 or more WORDs as the recommended QTG IDs in the recommended order. Create a cxl_root container for the root cxl_port and provide a callback ->get_qos_class() in order to retrieve the QoS class. For the ACPI case, the _DSM helper is used to retrieve the QTG ID and returned. A devm_cxl_add_root() function is added for root port setup and registration of the cxl_root callback operation(s). Signed-off-by: Dave Jiang Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/170319621294.2212653.1649682083061569256.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 132 +++++++++++++++++++++++++++++++++++++++- drivers/cxl/core/port.c | 49 ++++++++++++--- drivers/cxl/cxl.h | 25 ++++++++ 3 files changed, 193 insertions(+), 13 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 2034eb4ce83fb..2f7de910ce572 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "cxlpci.h" #include "cxl.h" @@ -17,6 +18,10 @@ struct cxl_cxims_data { u64 xormaps[] __counted_by(nr_maps); }; +static const guid_t acpi_cxl_qtg_id_guid = + GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071, + 0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52); + /* * Find a targets entry (n) in the host bridge interleave list. * CXL Specification 3.0 Table 9-22 @@ -194,6 +199,125 @@ struct cxl_cfmws_context { int id; }; +/** + * cxl_acpi_evaluate_qtg_dsm - Retrieve QTG ids via ACPI _DSM + * @handle: ACPI handle + * @coord: performance access coordinates + * @entries: number of QTG IDs to return + * @qos_class: int array provided by caller to return QTG IDs + * + * Return: number of QTG IDs returned, or -errno for errors + * + * Issue QTG _DSM with accompanied bandwidth and latency data in order to get + * the QTG IDs that are suitable for the performance point in order of most + * suitable to least suitable. Write back array of QTG IDs and return the + * actual number of QTG IDs written back. + */ +static int +cxl_acpi_evaluate_qtg_dsm(acpi_handle handle, struct access_coordinate *coord, + int entries, int *qos_class) +{ + union acpi_object *out_obj, *out_buf, *obj; + union acpi_object in_array[4] = { + [0].integer = { ACPI_TYPE_INTEGER, coord->read_latency }, + [1].integer = { ACPI_TYPE_INTEGER, coord->write_latency }, + [2].integer = { ACPI_TYPE_INTEGER, coord->read_bandwidth }, + [3].integer = { ACPI_TYPE_INTEGER, coord->write_bandwidth }, + }; + union acpi_object in_obj = { + .package = { + .type = ACPI_TYPE_PACKAGE, + .count = 4, + .elements = in_array, + }, + }; + int count, pkg_entries, i; + u16 max_qtg; + int rc; + + if (!entries) + return -EINVAL; + + out_obj = acpi_evaluate_dsm(handle, &acpi_cxl_qtg_id_guid, 1, 1, &in_obj); + if (!out_obj) + return -ENXIO; + + if (out_obj->type != ACPI_TYPE_PACKAGE) { + rc = -ENXIO; + goto out; + } + + /* Check Max QTG ID */ + obj = &out_obj->package.elements[0]; + if (obj->type != ACPI_TYPE_INTEGER) { + rc = -ENXIO; + goto out; + } + + max_qtg = obj->integer.value; + + /* It's legal to have 0 QTG entries */ + pkg_entries = out_obj->package.count; + if (pkg_entries <= 1) { + rc = 0; + goto out; + } + + /* Retrieve QTG IDs package */ + obj = &out_obj->package.elements[1]; + if (obj->type != ACPI_TYPE_PACKAGE) { + rc = -ENXIO; + goto out; + } + + pkg_entries = obj->package.count; + count = min(entries, pkg_entries); + for (i = 0; i < count; i++) { + u16 qtg_id; + + out_buf = &obj->package.elements[i]; + if (out_buf->type != ACPI_TYPE_INTEGER) { + rc = -ENXIO; + goto out; + } + + qtg_id = out_buf->integer.value; + if (qtg_id > max_qtg) + pr_warn("QTG ID %u greater than MAX %u\n", + qtg_id, max_qtg); + + qos_class[i] = qtg_id; + } + rc = count; + +out: + ACPI_FREE(out_obj); + return rc; +} + +static int cxl_acpi_qos_class(struct cxl_port *root_port, + struct access_coordinate *coord, int entries, + int *qos_class) +{ + acpi_handle handle; + struct device *dev; + + dev = root_port->uport_dev; + + if (!dev_is_platform(dev)) + return -ENODEV; + + handle = ACPI_HANDLE(dev); + if (!handle) + return -ENODEV; + + return cxl_acpi_evaluate_qtg_dsm(handle, coord, entries, qos_class); +} + +static const struct cxl_root_ops acpi_root_ops = { + .qos_class = cxl_acpi_qos_class, +}; + static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, const unsigned long end) { @@ -656,6 +780,7 @@ static int cxl_acpi_probe(struct platform_device *pdev) { int rc; struct resource *cxl_res; + struct cxl_root *cxl_root; struct cxl_port *root_port; struct device *host = &pdev->dev; struct acpi_device *adev = ACPI_COMPANION(host); @@ -675,9 +800,10 @@ static int cxl_acpi_probe(struct platform_device *pdev) cxl_res->end = -1; cxl_res->flags = IORESOURCE_MEM; - root_port = devm_cxl_add_port(host, host, CXL_RESOURCE_NONE, NULL); - if (IS_ERR(root_port)) - return PTR_ERR(root_port); + cxl_root = devm_cxl_add_root(host, &acpi_root_ops); + if (IS_ERR(cxl_root)) + return PTR_ERR(cxl_root); + root_port = &cxl_root->port; rc = bus_for_each_dev(adev->dev.bus, NULL, root_port, add_host_bridge_dport); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index b7c93bb18f6e7..9393cbf046528 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -541,7 +541,10 @@ static void cxl_port_release(struct device *dev) xa_destroy(&port->dports); xa_destroy(&port->regions); ida_free(&cxl_port_ida, port->id); - kfree(port); + if (is_cxl_root(port)) + kfree(to_cxl_root(port)); + else + kfree(port); } static ssize_t decoders_committed_show(struct device *dev, @@ -669,17 +672,31 @@ static struct lock_class_key cxl_port_key; static struct cxl_port *cxl_port_alloc(struct device *uport_dev, struct cxl_dport *parent_dport) { - struct cxl_port *port; + struct cxl_root *cxl_root __free(kfree) = NULL; + struct cxl_port *port, *_port __free(kfree) = NULL; struct device *dev; int rc; - port = kzalloc(sizeof(*port), GFP_KERNEL); - if (!port) - return ERR_PTR(-ENOMEM); + /* No parent_dport, root cxl_port */ + if (!parent_dport) { + cxl_root = kzalloc(sizeof(*cxl_root), GFP_KERNEL); + if (!cxl_root) + return ERR_PTR(-ENOMEM); + } else { + _port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!_port) + return ERR_PTR(-ENOMEM); + } rc = ida_alloc(&cxl_port_ida, GFP_KERNEL); if (rc < 0) - goto err; + return ERR_PTR(rc); + + if (cxl_root) + port = &no_free_ptr(cxl_root)->port; + else + port = no_free_ptr(_port); + port->id = rc; port->uport_dev = uport_dev; @@ -731,10 +748,6 @@ static struct cxl_port *cxl_port_alloc(struct device *uport_dev, dev->type = &cxl_port_type; return port; - -err: - kfree(port); - return ERR_PTR(rc); } static int cxl_setup_comp_regs(struct device *host, struct cxl_register_map *map, @@ -884,6 +897,22 @@ struct cxl_port *devm_cxl_add_port(struct device *host, } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_port, CXL); +struct cxl_root *devm_cxl_add_root(struct device *host, + const struct cxl_root_ops *ops) +{ + struct cxl_root *cxl_root; + struct cxl_port *port; + + port = devm_cxl_add_port(host, host, CXL_RESOURCE_NONE, NULL); + if (IS_ERR(port)) + return (struct cxl_root *)port; + + cxl_root = to_cxl_root(port); + cxl_root->ops = ops; + return cxl_root; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_root, CXL); + struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port) { /* There is no pci_bus associated with a CXL platform-root port */ diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 22f664b9f4c63..abbdcd3a75961 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -615,6 +615,29 @@ struct cxl_port { bool cdat_available; }; +struct cxl_root_ops { + int (*qos_class)(struct cxl_port *root_port, + struct access_coordinate *coord, int entries, + int *qos_class); +}; + +/** + * struct cxl_root - logical collection of root cxl_port items + * + * @port: cxl_port member + * @ops: cxl root operations + */ +struct cxl_root { + struct cxl_port port; + const struct cxl_root_ops *ops; +}; + +static inline struct cxl_root * +to_cxl_root(const struct cxl_port *port) +{ + return container_of(port, struct cxl_root, port); +} + static inline struct cxl_dport * cxl_find_dport_by_dev(struct cxl_port *port, const struct device *dport_dev) { @@ -703,6 +726,8 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport_dev, resource_size_t component_reg_phys, struct cxl_dport *parent_dport); +struct cxl_root *devm_cxl_add_root(struct device *host, + const struct cxl_root_ops *ops); struct cxl_port *find_cxl_root(struct cxl_port *port); int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd); void cxl_bus_rescan(void); -- GitLab From 4d07a05397c8c15c37c8c3abb7afaea1dcd2f0e7 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:03:39 -0700 Subject: [PATCH 0503/1290] cxl: Calculate and store PCI link latency for the downstream ports The latency is calculated by dividing the flit size over the bandwidth. Add support to retrieve the flit size for the CXL switch device and calculate the latency of the PCIe link. Cache the latency number with cxl_dport. Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170319621931.2212653.6800240203604822886.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/core.h | 2 ++ drivers/cxl/core/pci.c | 36 ++++++++++++++++++++++++++++++++++++ drivers/cxl/core/port.c | 6 ++++++ drivers/cxl/cxl.h | 4 ++++ drivers/cxl/cxlpci.h | 13 +++++++++++++ drivers/pci/pci.c | 38 ++++++++++++++++++++++++++++++++++++-- include/linux/pci.h | 1 + 7 files changed, 98 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 86d7ba23235e3..3b64fb1b9ed05 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -88,4 +88,6 @@ enum cxl_poison_trace_type { CXL_POISON_TRACE_CLEAR, }; +long cxl_pci_get_latency(struct pci_dev *pdev); + #endif /* __CXL_CORE_H__ */ diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 37e1652afbc7e..6c9c8d92f8f71 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright(c) 2021 Intel Corporation. All rights reserved. */ +#include #include #include #include @@ -979,3 +980,38 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_NEED_RESET; } EXPORT_SYMBOL_NS_GPL(cxl_error_detected, CXL); + +static int cxl_flit_size(struct pci_dev *pdev) +{ + if (cxl_pci_flit_256(pdev)) + return 256; + + return 68; +} + +/** + * cxl_pci_get_latency - calculate the link latency for the PCIe link + * @pdev: PCI device + * + * return: calculated latency or 0 for no latency + * + * CXL Memory Device SW Guide v1.0 2.11.4 Link latency calculation + * Link latency = LinkPropagationLatency + FlitLatency + RetimerLatency + * LinkProgationLatency is negligible, so 0 will be used + * RetimerLatency is assumed to be negligible and 0 will be used + * FlitLatency = FlitSize / LinkBandwidth + * FlitSize is defined by spec. CXL rev3.0 4.2.1. + * 68B flit is used up to 32GT/s. >32GT/s, 256B flit size is used. + * The FlitLatency is converted to picoseconds. + */ +long cxl_pci_get_latency(struct pci_dev *pdev) +{ + long bw; + + bw = pcie_link_speed_mbps(pdev); + if (bw < 0) + return 0; + bw /= BITS_PER_BYTE; + + return cxl_flit_size(pdev) * MEGA / bw; +} diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 9393cbf046528..b5ad227fe0d8f 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -854,6 +854,9 @@ static struct cxl_port *__devm_cxl_add_port(struct device *host, if (rc) return ERR_PTR(rc); + if (parent_dport && dev_is_pci(uport_dev)) + port->pci_latency = cxl_pci_get_latency(to_pci_dev(uport_dev)); + return port; err: @@ -1137,6 +1140,9 @@ __devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev, if (rc) return ERR_PTR(rc); + if (dev_is_pci(dport_dev)) + dport->link_latency = cxl_pci_get_latency(to_pci_dev(dport_dev)); + return dport; } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index abbdcd3a75961..7da8db919a20d 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -591,6 +591,7 @@ struct cxl_dax_region { * @depth: How deep this port is relative to the root. depth 0 is the root. * @cdat: Cached CDAT data * @cdat_available: Should a CDAT attribute be available in sysfs + * @pci_latency: Upstream latency in picoseconds */ struct cxl_port { struct device dev; @@ -613,6 +614,7 @@ struct cxl_port { size_t length; } cdat; bool cdat_available; + long pci_latency; }; struct cxl_root_ops { @@ -659,6 +661,7 @@ struct cxl_rcrb_info { * @port: reference to cxl_port that contains this downstream port * @regs: Dport parsed register blocks * @sw_coord: access coordinates (performance) for switch from CDAT + * @link_latency: calculated PCIe downstream latency */ struct cxl_dport { struct device *dport_dev; @@ -669,6 +672,7 @@ struct cxl_dport { struct cxl_port *port; struct cxl_regs regs; struct access_coordinate sw_coord; + long link_latency; }; /** diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 0fa4799ea316c..711b05d9a370e 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -85,6 +85,19 @@ struct cdat_entry_header { __le16 length; } __packed; +/* + * CXL v3.0 6.2.3 Table 6-4 + * The table indicates that if PCIe Flit Mode is set, then CXL is in 256B flits + * mode, otherwise it's 68B flits mode. + */ +static inline bool cxl_pci_flit_256(struct pci_dev *pdev) +{ + u16 lnksta2; + + pcie_capability_read_word(pdev, PCI_EXP_LNKSTA2, &lnksta2); + return lnksta2 & PCI_EXP_LNKSTA2_FLIT; +} + int devm_cxl_port_enumerate_dports(struct cxl_port *port); struct cxl_dev_state; int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 55bc3576a9856..00817d403ad4e 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6224,6 +6224,41 @@ int pcie_set_mps(struct pci_dev *dev, int mps) } EXPORT_SYMBOL(pcie_set_mps); +static enum pci_bus_speed to_pcie_link_speed(u16 lnksta) +{ + return pcie_link_speed[FIELD_GET(PCI_EXP_LNKSTA_CLS, lnksta)]; +} + +int pcie_link_speed_mbps(struct pci_dev *pdev) +{ + u16 lnksta; + int err; + + err = pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta); + if (err) + return err; + + switch (to_pcie_link_speed(lnksta)) { + case PCIE_SPEED_2_5GT: + return 2500; + case PCIE_SPEED_5_0GT: + return 5000; + case PCIE_SPEED_8_0GT: + return 8000; + case PCIE_SPEED_16_0GT: + return 16000; + case PCIE_SPEED_32_0GT: + return 32000; + case PCIE_SPEED_64_0GT: + return 64000; + default: + break; + } + + return -EINVAL; +} +EXPORT_SYMBOL(pcie_link_speed_mbps); + /** * pcie_bandwidth_available - determine minimum link settings of a PCIe * device and its bandwidth limitation @@ -6257,8 +6292,7 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev, while (dev) { pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta); - next_speed = pcie_link_speed[FIELD_GET(PCI_EXP_LNKSTA_CLS, - lnksta)]; + next_speed = to_pcie_link_speed(lnksta); next_width = FIELD_GET(PCI_EXP_LNKSTA_NLW, lnksta); next_bw = next_width * PCIE_SPEED2MBS_ENC(next_speed); diff --git a/include/linux/pci.h b/include/linux/pci.h index dea043bc1e383..504a4ba2c29ef 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1364,6 +1364,7 @@ int pcie_set_mps(struct pci_dev *dev, int mps); u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev, enum pci_bus_speed *speed, enum pcie_link_width *width); +int pcie_link_speed_mbps(struct pci_dev *pdev); void pcie_print_link_status(struct pci_dev *dev); int pcie_reset_flr(struct pci_dev *dev, bool probe); int pcie_flr(struct pci_dev *dev); -- GitLab From f2202f990456acc52b50f6b14c95b232ac14429b Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:03:45 -0700 Subject: [PATCH 0504/1290] tools/testing/cxl: Add hostbridge UID string for cxl_test mock hb devices In order to support acpi_device_uid() call, add static string to acpi_device->pnp.unique_id. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170319622564.2212653.1534465446670631698.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- tools/testing/cxl/test/cxl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index f4e517a0c7740..a3cdbb2be038c 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -68,15 +68,19 @@ static struct acpi_device acpi0017_mock; static struct acpi_device host_bridge[NR_BRIDGES] = { [0] = { .handle = &host_bridge[0], + .pnp.unique_id = "0", }, [1] = { .handle = &host_bridge[1], + .pnp.unique_id = "1", }, [2] = { .handle = &host_bridge[2], + .pnp.unique_id = "2", }, [3] = { .handle = &host_bridge[3], + .pnp.unique_id = "3", }, }; -- GitLab From 1037b82fccfe9c001ffa7a883651bb4cde7b705c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:03:51 -0700 Subject: [PATCH 0505/1290] cxl: Store the access coordinates for the generic ports Each CXL host bridge is represented by an ACPI0016 device. A generic port device handle that is an ACPI device is represented by a string of ACPI0016 device HID and UID. Create a device handle from the ACPI device and retrieve the access coordinates from the stored memory targets. The access coordinates are stored under the cxl_dport that is associated with the CXL host bridge. The access coordinates struct is dynamically allocated under cxl_dport in order for code later on to detect whether the data exists or not. Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170319623196.2212653.17916695743464172534.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 25 +++++++++++++++++++++++++ drivers/cxl/cxl.h | 2 ++ 2 files changed, 27 insertions(+) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 2f7de910ce572..afc712264d1c2 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -513,8 +513,29 @@ static int cxl_get_chbs(struct device *dev, struct acpi_device *hb, return 0; } +static int get_genport_coordinates(struct device *dev, struct cxl_dport *dport) +{ + struct acpi_device *hb = to_cxl_host_bridge(NULL, dev); + u32 uid; + int rc; + + if (kstrtou32(acpi_device_uid(hb), 0, &uid)) + return -EINVAL; + + rc = acpi_get_genport_coordinates(uid, &dport->hb_coord); + if (rc < 0) + return rc; + + /* Adjust back to picoseconds from nanoseconds */ + dport->hb_coord.read_latency *= 1000; + dport->hb_coord.write_latency *= 1000; + + return 0; +} + static int add_host_bridge_dport(struct device *match, void *arg) { + int ret; acpi_status rc; struct device *bridge; struct cxl_dport *dport; @@ -564,6 +585,10 @@ static int add_host_bridge_dport(struct device *match, void *arg) if (IS_ERR(dport)) return PTR_ERR(dport); + ret = get_genport_coordinates(match, dport); + if (ret) + dev_dbg(match, "Failed to get generic port perf coordinates.\n"); + return 0; } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 7da8db919a20d..dd234f3b9ed4d 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -661,6 +661,7 @@ struct cxl_rcrb_info { * @port: reference to cxl_port that contains this downstream port * @regs: Dport parsed register blocks * @sw_coord: access coordinates (performance) for switch from CDAT + * @hb_coord: access coordinates (performance) from ACPI generic port (host bridge) * @link_latency: calculated PCIe downstream latency */ struct cxl_dport { @@ -672,6 +673,7 @@ struct cxl_dport { struct cxl_port *port; struct cxl_regs regs; struct access_coordinate sw_coord; + struct access_coordinate hb_coord; long link_latency; }; -- GitLab From 14a6960b3e928ccea22d687fb0626237885a20bd Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:03:58 -0700 Subject: [PATCH 0506/1290] cxl: Add helper function that calculate performance data for downstream ports The CDAT information from the switch, Switch Scoped Latency and Bandwidth Information Structure (SSLBIS), is parsed and stored under a cxl_dport based on the correlated downstream port id from the SSLBIS entry. Walk the entire CXL port paths and collect all the performance data. Also pick up the link latency number that's stored under the dports. The entire path PCIe bandwidth can be retrieved using the pcie_bandwidth_available() call. Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170319623824.2212653.10302079766473698427.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 75 +++++++++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 3 ++ 2 files changed, 78 insertions(+) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index b5ad227fe0d8f..8c00fd6be730e 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -2094,6 +2095,80 @@ bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd) } EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, CXL); +static void combine_coordinates(struct access_coordinate *c1, + struct access_coordinate *c2) +{ + if (c2->write_bandwidth) + c1->write_bandwidth = min(c1->write_bandwidth, + c2->write_bandwidth); + c1->write_latency += c2->write_latency; + + if (c2->read_bandwidth) + c1->read_bandwidth = min(c1->read_bandwidth, + c2->read_bandwidth); + c1->read_latency += c2->read_latency; +} + +/** + * cxl_endpoint_get_perf_coordinates - Retrieve performance numbers stored in dports + * of CXL path + * @port: endpoint cxl_port + * @coord: output performance data + * + * Return: errno on failure, 0 on success. + */ +int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, + struct access_coordinate *coord) +{ + struct access_coordinate c = { + .read_bandwidth = UINT_MAX, + .write_bandwidth = UINT_MAX, + }; + struct cxl_port *iter = port; + struct cxl_dport *dport; + struct pci_dev *pdev; + unsigned int bw; + + if (!is_cxl_endpoint(port)) + return -EINVAL; + + dport = iter->parent_dport; + + /* + * Exit the loop when the parent port of the current port is cxl root. + * The iterative loop starts at the endpoint and gathers the + * latency of the CXL link from the current iter to the next downstream + * port each iteration. If the parent is cxl root then there is + * nothing to gather. + */ + while (iter && !is_cxl_root(to_cxl_port(iter->dev.parent))) { + combine_coordinates(&c, &dport->sw_coord); + c.write_latency += dport->link_latency; + c.read_latency += dport->link_latency; + + iter = to_cxl_port(iter->dev.parent); + dport = iter->parent_dport; + } + + /* Augment with the generic port (host bridge) perf data */ + combine_coordinates(&c, &dport->hb_coord); + + /* Get the calculated PCI paths bandwidth */ + pdev = to_pci_dev(port->uport_dev->parent); + bw = pcie_bandwidth_available(pdev, NULL, NULL, NULL); + if (bw == 0) + return -ENXIO; + bw /= BITS_PER_BYTE; + + c.write_bandwidth = min(c.write_bandwidth, bw); + c.read_bandwidth = min(c.read_bandwidth, bw); + + *coord = c; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_endpoint_get_perf_coordinates, CXL); + /* for user tooling to ensure port disable work has completed */ static ssize_t flush_store(const struct bus_type *bus, const char *buf, size_t count) { diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index dd234f3b9ed4d..492dbf63935fd 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -876,6 +876,9 @@ static inline struct cxl_dax_region *to_cxl_dax_region(struct device *dev) void cxl_endpoint_parse_cdat(struct cxl_port *port); void cxl_switch_parse_cdat(struct cxl_port *port); +int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, + struct access_coordinate *coord); + /* * Unit test builds overrides this to __weak, find the 'strong' version * of these symbols in tools/testing/cxl/. -- GitLab From 7a4f148dd8d518bc1e012aa738b0ed6244959293 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:04:04 -0700 Subject: [PATCH 0507/1290] cxl: Compute the entire CXL path latency and bandwidth data CXL Memory Device SW Guide [1] rev1.0 2.11.2 provides instruction on how to calculate latency and bandwidth for CXL memory device. Calculate minimum bandwidth and total latency for the path from the CXL device to the root port. The QTG id is retrieved by providing the performance data as input and calling the root port callback ->get_qos_class(). The retrieved id is stored with the cxl_port of the CXL device. For example for a device that is directly attached to a host bus: Total Latency = Device Latency (from CDAT) + Dev to Host Bus (HB) Link Latency + Generic Port Latency Min Bandwidth = Min bandwidth for link bandwidth between HB and CXL device, device CDAT bandwidth, and Generic Port Bandwidth For a device that has a switch in between host bus and CXL device: Total Latency = Device (CDAT) Latency + Dev to Switch Link Latency + Switch (CDAT) Latency + Switch to HB Link Latency + Generic Port Latency Min Bandwidth = Min bandwidth for link bandwidth between CXL device to CXL switch, CXL device CDAT bandwidth, CXL switch CDAT bandwidth, CXL switch to HB bandwidth, and Generic Port Bandwidth. [1]: https://cdrdv2-public.intel.com/643805/643805_CXL%20Memory%20Device%20SW%20Guide_Rev1p0.pdf Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170319624458.2212653.13252496567443656371.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/cdat.c | 59 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index b3ab47d250e16..43dfef80fb847 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -12,6 +12,9 @@ struct dsmas_entry { struct range dpa_range; u8 handle; struct access_coordinate coord; + + int entries; + int qos_class; }; static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg, @@ -154,6 +157,55 @@ static int cxl_cdat_endpoint_process(struct cxl_port *port, return cdat_table_parse_output(rc); } +static int cxl_port_perf_data_calculate(struct cxl_port *port, + struct xarray *dsmas_xa) +{ + struct access_coordinate c; + struct cxl_port *root_port; + struct cxl_root *cxl_root; + struct dsmas_entry *dent; + int valid_entries = 0; + unsigned long index; + int rc; + + rc = cxl_endpoint_get_perf_coordinates(port, &c); + if (rc) { + dev_dbg(&port->dev, "Failed to retrieve perf coordinates.\n"); + return rc; + } + + root_port = find_cxl_root(port); + cxl_root = to_cxl_root(root_port); + if (!cxl_root->ops || !cxl_root->ops->qos_class) + return -EOPNOTSUPP; + + xa_for_each(dsmas_xa, index, dent) { + int qos_class; + + dent->coord.read_latency = dent->coord.read_latency + + c.read_latency; + dent->coord.write_latency = dent->coord.write_latency + + c.write_latency; + dent->coord.read_bandwidth = min_t(int, c.read_bandwidth, + dent->coord.read_bandwidth); + dent->coord.write_bandwidth = min_t(int, c.write_bandwidth, + dent->coord.write_bandwidth); + + dent->entries = 1; + rc = cxl_root->ops->qos_class(root_port, &dent->coord, 1, &qos_class); + if (rc != 1) + continue; + + valid_entries++; + dent->qos_class = qos_class; + } + + if (!valid_entries) + return -ENOENT; + + return 0; +} + static void discard_dsmas(struct xarray *xa) { unsigned long index; @@ -183,7 +235,12 @@ void cxl_endpoint_parse_cdat(struct cxl_port *port) return; } - /* Performance data processing */ + rc = cxl_port_perf_data_calculate(port, dsmas_xa); + if (rc) { + dev_dbg(&port->dev, "Failed to do perf coord calculations.\n"); + return; + } + } EXPORT_SYMBOL_NS_GPL(cxl_endpoint_parse_cdat, CXL); -- GitLab From 86557b7edf77d2a3835136c325c8baa6fe803234 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:04:11 -0700 Subject: [PATCH 0508/1290] cxl: Store QTG IDs and related info to the CXL memory device context Once the QTG ID _DSM is executed successfully, the QTG ID is retrieved from the return package. Create a list of entries in the cxl_memdev context and store the QTG ID as qos_class token and the associated DPA range. This information can be exposed to user space via sysfs in order to help region setup for hot-plugged CXL memory devices. Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170319625109.2212653.11872111896220384056.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/cdat.c | 69 +++++++++++++++++++++++++++++++++++++++++ drivers/cxl/core/mbox.c | 2 ++ drivers/cxl/cxlmem.h | 21 +++++++++++++ 3 files changed, 92 insertions(+) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index 43dfef80fb847..6189d967f399c 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -6,6 +6,7 @@ #include #include #include "cxlpci.h" +#include "cxlmem.h" #include "cxl.h" struct dsmas_entry { @@ -206,6 +207,71 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, return 0; } +static void add_perf_entry(struct device *dev, struct dsmas_entry *dent, + struct list_head *list) +{ + struct cxl_dpa_perf *dpa_perf; + + dpa_perf = kzalloc(sizeof(*dpa_perf), GFP_KERNEL); + if (!dpa_perf) + return; + + dpa_perf->dpa_range = dent->dpa_range; + dpa_perf->coord = dent->coord; + dpa_perf->qos_class = dent->qos_class; + list_add_tail(&dpa_perf->list, list); + dev_dbg(dev, + "DSMAS: dpa: %#llx qos: %d read_bw: %d write_bw %d read_lat: %d write_lat: %d\n", + dent->dpa_range.start, dpa_perf->qos_class, + dent->coord.read_bandwidth, dent->coord.write_bandwidth, + dent->coord.read_latency, dent->coord.write_latency); +} + +static void free_perf_ents(void *data) +{ + struct cxl_memdev_state *mds = data; + struct cxl_dpa_perf *dpa_perf, *n; + LIST_HEAD(discard); + + list_splice_tail_init(&mds->ram_perf_list, &discard); + list_splice_tail_init(&mds->pmem_perf_list, &discard); + list_for_each_entry_safe(dpa_perf, n, &discard, list) { + list_del(&dpa_perf->list); + kfree(dpa_perf); + } +} + +static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds, + struct xarray *dsmas_xa) +{ + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + struct device *dev = cxlds->dev; + struct range pmem_range = { + .start = cxlds->pmem_res.start, + .end = cxlds->pmem_res.end, + }; + struct range ram_range = { + .start = cxlds->ram_res.start, + .end = cxlds->ram_res.end, + }; + struct dsmas_entry *dent; + unsigned long index; + + xa_for_each(dsmas_xa, index, dent) { + if (resource_size(&cxlds->ram_res) && + range_contains(&ram_range, &dent->dpa_range)) + add_perf_entry(dev, dent, &mds->ram_perf_list); + else if (resource_size(&cxlds->pmem_res) && + range_contains(&pmem_range, &dent->dpa_range)) + add_perf_entry(dev, dent, &mds->pmem_perf_list); + else + dev_dbg(dev, "no partition for dsmas dpa: %#llx\n", + dent->dpa_range.start); + } + + devm_add_action_or_reset(&cxlds->cxlmd->dev, free_perf_ents, mds); +} + static void discard_dsmas(struct xarray *xa) { unsigned long index; @@ -221,6 +287,8 @@ DEFINE_FREE(dsmas, struct xarray *, if (_T) discard_dsmas(_T)) void cxl_endpoint_parse_cdat(struct cxl_port *port) { + struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; struct xarray __dsmas_xa; struct xarray *dsmas_xa __free(dsmas) = &__dsmas_xa; int rc; @@ -241,6 +309,7 @@ void cxl_endpoint_parse_cdat(struct cxl_port *port) return; } + cxl_memdev_set_qos_class(cxlds, dsmas_xa); } EXPORT_SYMBOL_NS_GPL(cxl_endpoint_parse_cdat, CXL); diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 36270dcfb42ef..fbaa508ab2451 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1404,6 +1404,8 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) mds->cxlds.reg_map.host = dev; mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE; mds->cxlds.type = CXL_DEVTYPE_CLASSMEM; + INIT_LIST_HEAD(&mds->ram_perf_list); + INIT_LIST_HEAD(&mds->pmem_perf_list); return mds; } diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index a2fcbca253f39..205bc2a016b2b 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "cxl.h" /* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */ @@ -391,6 +392,20 @@ enum cxl_devtype { CXL_DEVTYPE_CLASSMEM, }; +/** + * struct cxl_dpa_perf - DPA performance property entry + * @list - list entry + * @dpa_range - range for DPA address + * @coord - QoS performance data (i.e. latency, bandwidth) + * @qos_class - QoS Class cookies + */ +struct cxl_dpa_perf { + struct list_head list; + struct range dpa_range; + struct access_coordinate coord; + int qos_class; +}; + /** * struct cxl_dev_state - The driver device state * @@ -455,6 +470,8 @@ struct cxl_dev_state { * @security: security driver state info * @fw: firmware upload / activation state * @mbox_send: @dev specific transport for transmitting mailbox commands + * @ram_perf_list: performance data entries matched to RAM + * @pmem_perf_list: performance data entries matched to PMEM * * See CXL 3.0 8.2.9.8.2 Capacity Configuration and Label Storage for * details on capacity parameters. @@ -475,6 +492,10 @@ struct cxl_memdev_state { u64 active_persistent_bytes; u64 next_volatile_bytes; u64 next_persistent_bytes; + + struct list_head ram_perf_list; + struct list_head pmem_perf_list; + struct cxl_event_state event; struct cxl_poison_state poison; struct cxl_security_state security; -- GitLab From 42834b17cf1f00fa79ff1f02134f9c576a125252 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:04:16 -0700 Subject: [PATCH 0509/1290] cxl: Export sysfs attributes for memory device QoS class Export qos_class sysfs attributes for the CXL memory device. The QoS clas should show up as /sys/bus/cxl/devices/memX/ram/qos_class for the volatile partition and /sys/bus/cxl/devices/memX/pmem/qos_class for the persistent partition. The QTG ID is retrieved via _DSM after supplying the calculated bandwidth and latency for the entire CXL path from device to the CPU. This ID is used to match up to the root decoder QoS class to determine which CFMWS the memory range of a hotplugged CXL mem device should be assigned under. While there may be multiple DSMAS exported by the device CDAT, the driver will only expose the first QTG ID per partition in sysfs for now. In the future when multiple QTG IDs are necessary, they can be exposed. [1] [1]: https://lore.kernel.org/linux-cxl/167571650007.587790.10040913293130712882.stgit@djiang5-mobl3.local/T/#md2a47b1ead3e1ba08f50eab29a4af1aed1d215ab Suggested-by: Dan Williams Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170319625698.2212653.17544381274847420961.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 34 +++++++++++++ drivers/cxl/mem.c | 67 ++++++++++++++++++++++--- 2 files changed, 95 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index e76c3600607f8..fff2581b80335 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -28,6 +28,23 @@ Description: Payload in the CXL-2.0 specification. +What: /sys/bus/cxl/devices/memX/ram/qos_class +Date: May, 2023 +KernelVersion: v6.8 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) For CXL host platforms that support "QoS Telemmetry" + this attribute conveys a comma delimited list of platform + specific cookies that identifies a QoS performance class + for the volatile partition of the CXL mem device. These + class-ids can be compared against a similar "qos_class" + published for a root decoder. While it is not required + that the endpoints map their local memory-class to a + matching platform class, mismatches are not recommended + and there are platform specific performance related + side-effects that may result. First class-id is displayed. + + What: /sys/bus/cxl/devices/memX/pmem/size Date: December, 2020 KernelVersion: v5.12 @@ -38,6 +55,23 @@ Description: Payload in the CXL-2.0 specification. +What: /sys/bus/cxl/devices/memX/pmem/qos_class +Date: May, 2023 +KernelVersion: v6.8 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) For CXL host platforms that support "QoS Telemmetry" + this attribute conveys a comma delimited list of platform + specific cookies that identifies a QoS performance class + for the persistent partition of the CXL mem device. These + class-ids can be compared against a similar "qos_class" + published for a root decoder. While it is not required + that the endpoints map their local memory-class to a + matching platform class, mismatches are not recommended + and there are platform specific performance related + side-effects that may result. First class-id is displayed. + + What: /sys/bus/cxl/devices/memX/serial Date: January, 2022 KernelVersion: v5.18 diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index e087febf9af04..c5c9d8e0d88d6 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -215,23 +215,78 @@ static ssize_t trigger_poison_list_store(struct device *dev, } static DEVICE_ATTR_WO(trigger_poison_list); +static ssize_t ram_qos_class_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + struct cxl_dpa_perf *dpa_perf; + + if (!dev->driver) + return -ENOENT; + + if (list_empty(&mds->ram_perf_list)) + return -ENOENT; + + dpa_perf = list_first_entry(&mds->ram_perf_list, struct cxl_dpa_perf, + list); + + return sysfs_emit(buf, "%d\n", dpa_perf->qos_class); +} + +static struct device_attribute dev_attr_ram_qos_class = + __ATTR(qos_class, 0444, ram_qos_class_show, NULL); + +static ssize_t pmem_qos_class_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + struct cxl_dpa_perf *dpa_perf; + + if (!dev->driver) + return -ENOENT; + + if (list_empty(&mds->pmem_perf_list)) + return -ENOENT; + + dpa_perf = list_first_entry(&mds->pmem_perf_list, struct cxl_dpa_perf, + list); + + return sysfs_emit(buf, "%d\n", dpa_perf->qos_class); +} + +static struct device_attribute dev_attr_pmem_qos_class = + __ATTR(qos_class, 0444, pmem_qos_class_show, NULL); + static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n) { - if (a == &dev_attr_trigger_poison_list.attr) { - struct device *dev = kobj_to_dev(kobj); - struct cxl_memdev *cxlmd = to_cxl_memdev(dev); - struct cxl_memdev_state *mds = - to_cxl_memdev_state(cxlmd->cxlds); + struct device *dev = kobj_to_dev(kobj); + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + if (a == &dev_attr_trigger_poison_list.attr) if (!test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds)) return 0; - } + + if (a == &dev_attr_pmem_qos_class.attr) + if (list_empty(&mds->pmem_perf_list)) + return 0; + + if (a == &dev_attr_ram_qos_class.attr) + if (list_empty(&mds->ram_perf_list)) + return 0; + return a->mode; } static struct attribute *cxl_mem_attrs[] = { &dev_attr_trigger_poison_list.attr, + &dev_attr_ram_qos_class.attr, + &dev_attr_pmem_qos_class.attr, NULL }; -- GitLab From 185c1a489f873cb71520fc089401e02dbf302dcd Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:04:23 -0700 Subject: [PATCH 0510/1290] cxl: Check qos_class validity on memdev probe Add a check to make sure the qos_class for the device will match one of the root decoders qos_class. If no match is found, then the qos_class for the device is set to invalid. Also add a check to ensure that the device's host bridge matches to one of the root decoder's downstream targets. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170319626313.2212653.9021004640856081917.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/cdat.c | 103 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index 6189d967f399c..cd84d87f597a6 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -272,6 +272,108 @@ static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds, devm_add_action_or_reset(&cxlds->cxlmd->dev, free_perf_ents, mds); } +static int match_cxlrd_qos_class(struct device *dev, void *data) +{ + int dev_qos_class = *(int *)data; + struct cxl_root_decoder *cxlrd; + + if (!is_root_decoder(dev)) + return 0; + + cxlrd = to_cxl_root_decoder(dev); + if (cxlrd->qos_class == CXL_QOS_CLASS_INVALID) + return 0; + + if (cxlrd->qos_class == dev_qos_class) + return 1; + + return 0; +} + +static void cxl_qos_match(struct cxl_port *root_port, + struct list_head *work_list, + struct list_head *discard_list) +{ + struct cxl_dpa_perf *dpa_perf, *n; + + list_for_each_entry_safe(dpa_perf, n, work_list, list) { + int rc; + + if (dpa_perf->qos_class == CXL_QOS_CLASS_INVALID) + return; + + rc = device_for_each_child(&root_port->dev, + (void *)&dpa_perf->qos_class, + match_cxlrd_qos_class); + if (!rc) + list_move_tail(&dpa_perf->list, discard_list); + } +} + +static int match_cxlrd_hb(struct device *dev, void *data) +{ + struct device *host_bridge = data; + struct cxl_switch_decoder *cxlsd; + struct cxl_root_decoder *cxlrd; + unsigned int seq; + + if (!is_root_decoder(dev)) + return 0; + + cxlrd = to_cxl_root_decoder(dev); + cxlsd = &cxlrd->cxlsd; + + do { + seq = read_seqbegin(&cxlsd->target_lock); + for (int i = 0; i < cxlsd->nr_targets; i++) { + if (host_bridge == cxlsd->target[i]->dport_dev) + return 1; + } + } while (read_seqretry(&cxlsd->target_lock, seq)); + + return 0; +} + +static void discard_dpa_perf(struct list_head *list) +{ + struct cxl_dpa_perf *dpa_perf, *n; + + list_for_each_entry_safe(dpa_perf, n, list, list) { + list_del(&dpa_perf->list); + kfree(dpa_perf); + } +} +DEFINE_FREE(dpa_perf, struct list_head *, if (!list_empty(_T)) discard_dpa_perf(_T)) + +static int cxl_qos_class_verify(struct cxl_memdev *cxlmd) +{ + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + struct cxl_port *root_port __free(put_device) = NULL; + LIST_HEAD(__discard); + struct list_head *discard __free(dpa_perf) = &__discard; + int rc; + + root_port = find_cxl_root(cxlmd->endpoint); + if (!root_port) + return -ENODEV; + + /* Check that the QTG IDs are all sane between end device and root decoders */ + cxl_qos_match(root_port, &mds->ram_perf_list, discard); + cxl_qos_match(root_port, &mds->pmem_perf_list, discard); + + /* Check to make sure that the device's host bridge is under a root decoder */ + rc = device_for_each_child(&root_port->dev, + (void *)cxlmd->endpoint->host_bridge, + match_cxlrd_hb); + if (!rc) { + list_splice_tail_init(&mds->ram_perf_list, discard); + list_splice_tail_init(&mds->pmem_perf_list, discard); + } + + return rc; +} + static void discard_dsmas(struct xarray *xa) { unsigned long index; @@ -310,6 +412,7 @@ void cxl_endpoint_parse_cdat(struct cxl_port *port) } cxl_memdev_set_qos_class(cxlds, dsmas_xa); + cxl_qos_class_verify(cxlmd); } EXPORT_SYMBOL_NS_GPL(cxl_endpoint_parse_cdat, CXL); -- GitLab From aefebd19a8422eb9253cf137e526cc3b11c95a88 Mon Sep 17 00:00:00 2001 From: Anshul Dalal Date: Fri, 22 Dec 2023 00:01:08 +0530 Subject: [PATCH 0511/1290] dt-bindings: input: convert drv266x to json-schema Convert devicetree binding documentation for ti drv2665 and drv2667 haptics driver to json-schema. The previously two separate bindings have been merged into a single drv266x.yaml. Signed-off-by: Anshul Dalal Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231221183109.684325-1-anshulusr@gmail.com Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/ti,drv2665.txt | 17 ------- .../devicetree/bindings/input/ti,drv2667.txt | 17 ------- .../devicetree/bindings/input/ti,drv266x.yaml | 50 +++++++++++++++++++ 3 files changed, 50 insertions(+), 34 deletions(-) delete mode 100644 Documentation/devicetree/bindings/input/ti,drv2665.txt delete mode 100644 Documentation/devicetree/bindings/input/ti,drv2667.txt create mode 100644 Documentation/devicetree/bindings/input/ti,drv266x.yaml diff --git a/Documentation/devicetree/bindings/input/ti,drv2665.txt b/Documentation/devicetree/bindings/input/ti,drv2665.txt deleted file mode 100644 index 1ba97ac043058..0000000000000 --- a/Documentation/devicetree/bindings/input/ti,drv2665.txt +++ /dev/null @@ -1,17 +0,0 @@ -* Texas Instruments - drv2665 Haptics driver - -Required properties: - - compatible - "ti,drv2665" - DRV2665 - - reg - I2C slave address - - vbat-supply - Required supply regulator - -Example: - -haptics: haptics@59 { - compatible = "ti,drv2665"; - reg = <0x59>; - vbat-supply = <&vbat>; -}; - -For more product information please see the link below: -http://www.ti.com/product/drv2665 diff --git a/Documentation/devicetree/bindings/input/ti,drv2667.txt b/Documentation/devicetree/bindings/input/ti,drv2667.txt deleted file mode 100644 index 996382cf994a1..0000000000000 --- a/Documentation/devicetree/bindings/input/ti,drv2667.txt +++ /dev/null @@ -1,17 +0,0 @@ -* Texas Instruments - drv2667 Haptics driver - -Required properties: - - compatible - "ti,drv2667" - DRV2667 - - reg - I2C slave address - - vbat-supply - Required supply regulator - -Example: - -haptics: haptics@59 { - compatible = "ti,drv2667"; - reg = <0x59>; - vbat-supply = <&vbat>; -}; - -For more product information please see the link below: -http://www.ti.com/product/drv2667 diff --git a/Documentation/devicetree/bindings/input/ti,drv266x.yaml b/Documentation/devicetree/bindings/input/ti,drv266x.yaml new file mode 100644 index 0000000000000..da18188243738 --- /dev/null +++ b/Documentation/devicetree/bindings/input/ti,drv266x.yaml @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/ti,drv266x.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Texas Instruments - drv266x Haptics driver + +description: | + Product Page: + http://www.ti.com/product/drv2665 + http://www.ti.com/product/drv2667 + +maintainers: + - Anshul Dalal + +properties: + compatible: + enum: + - ti,drv2665 + - ti,drv2667 + + reg: + maxItems: 1 + + vbat-supply: + description: Required supply regulator + +required: + - compatible + - reg + - vbat-supply + +additionalProperties: false + +examples: + - | + #include + + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + haptics@59 { + compatible = "ti,drv2667"; + reg = <0x59>; + vbat-supply = <&vbat>; + }; + }; -- GitLab From 60cb19b485a534a896431393a877d853bbe51b67 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:07 -0800 Subject: [PATCH 0512/1290] perf dwarf-aux: Factor out die_get_typename_from_type() The die_get_typename_from_type() is to get the name of the given DIE in C-style type name. The difference from die_get_typename() is that it does not retrieve the DW_AT_type and use the given DIE directly. This will be used when users know the type DIE already. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 38 ++++++++++++++++++++++++++----------- tools/perf/util/dwarf-aux.h | 3 +++ 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index edd9e407bc74e..7aa5fee0da190 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1051,32 +1051,28 @@ Dwarf_Die *die_find_member(Dwarf_Die *st_die, const char *name, } /** - * die_get_typename - Get the name of given variable DIE - * @vr_die: a variable DIE + * die_get_typename_from_type - Get the name of given type DIE + * @type_die: a type DIE * @buf: a strbuf for result type name * - * Get the name of @vr_die and stores it to @buf. Return 0 if succeeded. + * Get the name of @type_die and stores it to @buf. Return 0 if succeeded. * and Return -ENOENT if failed to find type name. * Note that the result will stores typedef name if possible, and stores * "*(function_type)" if the type is a function pointer. */ -int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf) +int die_get_typename_from_type(Dwarf_Die *type_die, struct strbuf *buf) { - Dwarf_Die type; int tag, ret; const char *tmp = ""; - if (__die_get_real_type(vr_die, &type) == NULL) - return -ENOENT; - - tag = dwarf_tag(&type); + tag = dwarf_tag(type_die); if (tag == DW_TAG_array_type || tag == DW_TAG_pointer_type) tmp = "*"; else if (tag == DW_TAG_subroutine_type) { /* Function pointer */ return strbuf_add(buf, "(function_type)", 15); } else { - const char *name = dwarf_diename(&type); + const char *name = dwarf_diename(type_die); if (tag == DW_TAG_union_type) tmp = "union "; @@ -1089,7 +1085,7 @@ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf) /* Write a base name */ return strbuf_addf(buf, "%s%s", tmp, name ?: ""); } - ret = die_get_typename(&type, buf); + ret = die_get_typename(type_die, buf); if (ret < 0) { /* void pointer has no type attribute */ if (tag == DW_TAG_pointer_type && ret == -ENOENT) @@ -1100,6 +1096,26 @@ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf) return strbuf_addstr(buf, tmp); } +/** + * die_get_typename - Get the name of given variable DIE + * @vr_die: a variable DIE + * @buf: a strbuf for result type name + * + * Get the name of @vr_die and stores it to @buf. Return 0 if succeeded. + * and Return -ENOENT if failed to find type name. + * Note that the result will stores typedef name if possible, and stores + * "*(function_type)" if the type is a function pointer. + */ +int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf) +{ + Dwarf_Die type; + + if (__die_get_real_type(vr_die, &type) == NULL) + return -ENOENT; + + return die_get_typename_from_type(&type, buf); +} + /** * die_get_varname - Get the name and type of given variable DIE * @vr_die: a variable DIE diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index 0ddf61fd3f8be..4e64caac6df83 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -116,6 +116,9 @@ Dwarf_Die *die_find_variable_at(Dwarf_Die *sp_die, const char *name, Dwarf_Die *die_find_member(Dwarf_Die *st_die, const char *name, Dwarf_Die *die_mem); +/* Get the name of given type DIE */ +int die_get_typename_from_type(Dwarf_Die *type_die, struct strbuf *buf); + /* Get the name of given variable DIE */ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf); -- GitLab From 3eee606757ad82f32332a2da174aa032bfd9cc32 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:08 -0800 Subject: [PATCH 0513/1290] perf dwarf-regs: Add get_dwarf_regnum() The get_dwarf_regnum() returns a DWARF register number from a register name string according to the psABI. Also add two pseudo encodings of DWARF_REG_PC which is a register that are used by PC-relative addressing and DWARF_REG_FB which is a frame base register. They need to be handled in a special way. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/dwarf-regs.c | 38 +++++++++++++++++++++++++++ tools/perf/util/dwarf-regs.c | 34 ++++++++++++++++++++++++ tools/perf/util/include/dwarf-regs.h | 19 ++++++++++++++ 3 files changed, 91 insertions(+) diff --git a/tools/perf/arch/x86/util/dwarf-regs.c b/tools/perf/arch/x86/util/dwarf-regs.c index 5309348057108..399c4a0a29d8c 100644 --- a/tools/perf/arch/x86/util/dwarf-regs.c +++ b/tools/perf/arch/x86/util/dwarf-regs.c @@ -113,3 +113,41 @@ int regs_query_register_offset(const char *name) return roff->offset; return -EINVAL; } + +struct dwarf_regs_idx { + const char *name; + int idx; +}; + +static const struct dwarf_regs_idx x86_regidx_table[] = { + { "rax", 0 }, { "eax", 0 }, { "ax", 0 }, { "al", 0 }, + { "rdx", 1 }, { "edx", 1 }, { "dx", 1 }, { "dl", 1 }, + { "rcx", 2 }, { "ecx", 2 }, { "cx", 2 }, { "cl", 2 }, + { "rbx", 3 }, { "edx", 3 }, { "bx", 3 }, { "bl", 3 }, + { "rsi", 4 }, { "esi", 4 }, { "si", 4 }, { "sil", 4 }, + { "rdi", 5 }, { "edi", 5 }, { "di", 5 }, { "dil", 5 }, + { "rbp", 6 }, { "ebp", 6 }, { "bp", 6 }, { "bpl", 6 }, + { "rsp", 7 }, { "esp", 7 }, { "sp", 7 }, { "spl", 7 }, + { "r8", 8 }, { "r8d", 8 }, { "r8w", 8 }, { "r8b", 8 }, + { "r9", 9 }, { "r9d", 9 }, { "r9w", 9 }, { "r9b", 9 }, + { "r10", 10 }, { "r10d", 10 }, { "r10w", 10 }, { "r10b", 10 }, + { "r11", 11 }, { "r11d", 11 }, { "r11w", 11 }, { "r11b", 11 }, + { "r12", 12 }, { "r12d", 12 }, { "r12w", 12 }, { "r12b", 12 }, + { "r13", 13 }, { "r13d", 13 }, { "r13w", 13 }, { "r13b", 13 }, + { "r14", 14 }, { "r14d", 14 }, { "r14w", 14 }, { "r14b", 14 }, + { "r15", 15 }, { "r15d", 15 }, { "r15w", 15 }, { "r15b", 15 }, + { "rip", DWARF_REG_PC }, +}; + +int get_arch_regnum(const char *name) +{ + unsigned int i; + + if (*name != '%') + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(x86_regidx_table); i++) + if (!strcmp(x86_regidx_table[i].name, name + 1)) + return x86_regidx_table[i].idx; + return -ENOENT; +} diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index 69cfaa5953bf4..5b7f86c0063f2 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -5,9 +5,12 @@ * Written by: Masami Hiramatsu */ +#include +#include #include #include #include +#include #include #ifndef EM_AARCH64 @@ -68,3 +71,34 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine) } return NULL; } + +__weak int get_arch_regnum(const char *name __maybe_unused) +{ + return -ENOTSUP; +} + +/* Return DWARF register number from architecture register name */ +int get_dwarf_regnum(const char *name, unsigned int machine) +{ + char *regname = strdup(name); + int reg = -1; + char *p; + + if (regname == NULL) + return -EINVAL; + + /* For convenience, remove trailing characters */ + p = strpbrk(regname, " ,)"); + if (p) + *p = '\0'; + + switch (machine) { + case EM_NONE: /* Generic arch - use host arch */ + reg = get_arch_regnum(regname); + break; + default: + pr_err("ELF MACHINE %x is not supported.\n", machine); + } + free(regname); + return reg; +} diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h index 7d99a084e82d7..01fb25a1150af 100644 --- a/tools/perf/util/include/dwarf-regs.h +++ b/tools/perf/util/include/dwarf-regs.h @@ -2,6 +2,9 @@ #ifndef _PERF_DWARF_REGS_H_ #define _PERF_DWARF_REGS_H_ +#define DWARF_REG_PC 0xd3af9c /* random number */ +#define DWARF_REG_FB 0xd3affb /* random number */ + #ifdef HAVE_DWARF_SUPPORT const char *get_arch_regstr(unsigned int n); /* @@ -10,6 +13,22 @@ const char *get_arch_regstr(unsigned int n); * machine: ELF machine signature (EM_*) */ const char *get_dwarf_regstr(unsigned int n, unsigned int machine); + +int get_arch_regnum(const char *name); +/* + * get_dwarf_regnum - Returns DWARF regnum from register name + * name: architecture register name + * machine: ELF machine signature (EM_*) + */ +int get_dwarf_regnum(const char *name, unsigned int machine); + +#else /* HAVE_DWARF_SUPPORT */ + +static inline int get_dwarf_regnum(const char *name __maybe_unused, + unsigned int machine __maybe_unused) +{ + return -1; +} #endif #ifdef HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET -- GitLab From b9c87f536c6f28c75ace8a014646faad00f0e1ec Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:09 -0800 Subject: [PATCH 0514/1290] perf annotate-data: Add find_data_type() to get type from memory access The find_data_type() is to get a data type from the memory access at the given address (IP) using a register and an offset. It requires DWARF debug info in the DSO and searches the list of variables and function parameters in the scope. In a pseudo code, it does basically the following: find_data_type(dso, ip, reg, offset) { pc = map__rip_2objdump(ip); CU = dwarf_addrdie(dso->dwarf, pc); scopes = die_get_scopes(CU, pc); for_each_scope(S, scopes) { V = die_find_variable_by_reg(S, pc, reg); if (V && V.type == pointer_type) { T = die_get_real_type(V); if (offset < T.size) return T; } } return NULL; } Committer notes: The 'size' variable in check_variable() is 64-bit, so use PRIu64 and inttypes.h to debug it. Ditto at find_data_type_die(). Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/annotate-data.c | 164 ++++++++++++++++++++++++++++++++ tools/perf/util/annotate-data.h | 40 ++++++++ 3 files changed, 205 insertions(+) create mode 100644 tools/perf/util/annotate-data.c create mode 100644 tools/perf/util/annotate-data.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 132508ebe125c..8027f450fa3e4 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -196,6 +196,7 @@ perf-$(CONFIG_DWARF) += probe-finder.o perf-$(CONFIG_DWARF) += dwarf-aux.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_DWARF) += debuginfo.o +perf-$(CONFIG_DWARF) += annotate-data.o perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind-local.o diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c new file mode 100644 index 0000000000000..9739ecc841eeb --- /dev/null +++ b/tools/perf/util/annotate-data.c @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Convert sample address to data type using DWARF debug info. + * + * Written by Namhyung Kim + */ + +#include +#include +#include + +#include "annotate-data.h" +#include "debuginfo.h" +#include "debug.h" +#include "dso.h" +#include "map.h" +#include "map_symbol.h" +#include "strbuf.h" +#include "symbol.h" + +static bool find_cu_die(struct debuginfo *di, u64 pc, Dwarf_Die *cu_die) +{ + Dwarf_Off off, next_off; + size_t header_size; + + if (dwarf_addrdie(di->dbg, pc, cu_die) != NULL) + return cu_die; + + /* + * There are some kernels don't have full aranges and contain only a few + * aranges entries. Fallback to iterate all CU entries in .debug_info + * in case it's missing. + */ + off = 0; + while (dwarf_nextcu(di->dbg, off, &next_off, &header_size, + NULL, NULL, NULL) == 0) { + if (dwarf_offdie(di->dbg, off + header_size, cu_die) && + dwarf_haspc(cu_die, pc)) + return true; + + off = next_off; + } + return false; +} + +/* The type info will be saved in @type_die */ +static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset) +{ + Dwarf_Word size; + + /* Get the type of the variable */ + if (die_get_real_type(var_die, type_die) == NULL) { + pr_debug("variable has no type\n"); + return -1; + } + + /* + * It expects a pointer type for a memory access. + * Convert to a real type it points to. + */ + if (dwarf_tag(type_die) != DW_TAG_pointer_type || + die_get_real_type(type_die, type_die) == NULL) { + pr_debug("no pointer or no type\n"); + return -1; + } + + /* Get the size of the actual type */ + if (dwarf_aggregate_size(type_die, &size) < 0) { + pr_debug("type size is unknown\n"); + return -1; + } + + /* Minimal sanity check */ + if ((unsigned)offset >= size) { + pr_debug("offset: %d is bigger than size: %" PRIu64 "\n", offset, size); + return -1; + } + + return 0; +} + +/* The result will be saved in @type_die */ +static int find_data_type_die(struct debuginfo *di, u64 pc, + int reg, int offset, Dwarf_Die *type_die) +{ + Dwarf_Die cu_die, var_die; + Dwarf_Die *scopes = NULL; + int ret = -1; + int i, nr_scopes; + + /* Get a compile_unit for this address */ + if (!find_cu_die(di, pc, &cu_die)) { + pr_debug("cannot find CU for address %" PRIx64 "\n", pc); + return -1; + } + + /* Get a list of nested scopes - i.e. (inlined) functions and blocks. */ + nr_scopes = die_get_scopes(&cu_die, pc, &scopes); + + /* Search from the inner-most scope to the outer */ + for (i = nr_scopes - 1; i >= 0; i--) { + /* Look up variables/parameters in this scope */ + if (!die_find_variable_by_reg(&scopes[i], pc, reg, &var_die)) + continue; + + /* Found a variable, see if it's correct */ + ret = check_variable(&var_die, type_die, offset); + break; + } + + free(scopes); + return ret; +} + +/** + * find_data_type - Return a data type at the location + * @ms: map and symbol at the location + * @ip: instruction address of the memory access + * @reg: register that holds the base address + * @offset: offset from the base address + * + * This functions searches the debug information of the binary to get the data + * type it accesses. The exact location is expressed by (ip, reg, offset). + * It return %NULL if not found. + */ +struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip, + int reg, int offset) +{ + struct annotated_data_type *result = NULL; + struct dso *dso = map__dso(ms->map); + struct debuginfo *di; + Dwarf_Die type_die; + struct strbuf sb; + u64 pc; + + di = debuginfo__new(dso->long_name); + if (di == NULL) { + pr_debug("cannot get the debug info\n"); + return NULL; + } + + /* + * IP is a relative instruction address from the start of the map, as + * it can be randomized/relocated, it needs to translate to PC which is + * a file address for DWARF processing. + */ + pc = map__rip_2objdump(ms->map, ip); + if (find_data_type_die(di, pc, reg, offset, &type_die) < 0) + goto out; + + result = zalloc(sizeof(*result)); + if (result == NULL) + goto out; + + strbuf_init(&sb, 32); + if (die_get_typename_from_type(&type_die, &sb) < 0) + strbuf_add(&sb, "(unknown type)", 14); + + result->type_name = strbuf_detach(&sb, NULL); + +out: + debuginfo__delete(di); + return result; +} diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h new file mode 100644 index 0000000000000..633147f78ca5e --- /dev/null +++ b/tools/perf/util/annotate-data.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PERF_ANNOTATE_DATA_H +#define _PERF_ANNOTATE_DATA_H + +#include +#include +#include + +struct map_symbol; + +/** + * struct annotated_data_type - Data type to profile + * @type_name: Name of the data type + * @type_size: Size of the data type + * + * This represents a data type accessed by samples in the profile data. + */ +struct annotated_data_type { + char *type_name; + int type_size; +}; + +#ifdef HAVE_DWARF_SUPPORT + +/* Returns data type at the location (ip, reg, offset) */ +struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip, + int reg, int offset); + +#else /* HAVE_DWARF_SUPPORT */ + +static inline struct annotated_data_type * +find_data_type(struct map_symbol *ms __maybe_unused, u64 ip __maybe_unused, + int reg __maybe_unused, int offset __maybe_unused) +{ + return NULL; +} + +#endif /* HAVE_DWARF_SUPPORT */ + +#endif /* _PERF_ANNOTATE_DATA_H */ -- GitLab From fc044c53b99fad039ac30b95b289992ebf7dd6b4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:10 -0800 Subject: [PATCH 0515/1290] perf annotate-data: Add dso->data_types tree To aggregate accesses to the same data type, add 'data_types' tree in DSO to maintain data types and find it by name and size. It might have different data types that happen to have the same name, so it also compares the size of the type. Even if it doesn't 100% guarantee, it reduces the possibility of mis-handling of such conflicts. And I don't think it's common to have different types with the same name. Committer notes: Very few cases on the Linux kernel, but there are some different types with the same name, unsure if there is a debug mode in libbpf dedup that warns about such cases, but there are provisions in pahole for that, see: "emit: Notice type shadowing, i.e. multiple types with the same name (enum, struct, union, etc)" https://git.kernel.org/pub/scm/devel/pahole/pahole.git/commit/?id=4f332dbfd02072e4f410db7bdcda8d6e3422974b $ pahole --compile > vmlinux.h $ rm -f a ; make a cc a.c -o a $ grep __[0-9] vmlinux.h union irte__1 { struct map_info__1; struct map_info__1 { struct map_info__1 * next; /* 0 8 */ $ drivers/iommu/amd/amd_iommu_types.h 'union irte' include/linux/dmar.h 'struct irte' include/linux/device-mapper.h: union map_info { void *ptr; }; include/linux/mtd/map.h: struct map_info { const char *name; unsigned long size; resource_size_t phys; kernel/events/uprobes.c: struct map_info { struct map_info *next; struct mm_struct *mm; unsigned long vaddr; }; Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 95 +++++++++++++++++++++++++++++---- tools/perf/util/annotate-data.h | 9 ++++ tools/perf/util/dso.c | 4 ++ tools/perf/util/dso.h | 2 + 4 files changed, 100 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 9739ecc841eeb..1f921971174d2 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -18,6 +18,76 @@ #include "strbuf.h" #include "symbol.h" +/* + * Compare type name and size to maintain them in a tree. + * I'm not sure if DWARF would have information of a single type in many + * different places (compilation units). If not, it could compare the + * offset of the type entry in the .debug_info section. + */ +static int data_type_cmp(const void *_key, const struct rb_node *node) +{ + const struct annotated_data_type *key = _key; + struct annotated_data_type *type; + + type = rb_entry(node, struct annotated_data_type, node); + + if (key->type_size != type->type_size) + return key->type_size - type->type_size; + return strcmp(key->type_name, type->type_name); +} + +static bool data_type_less(struct rb_node *node_a, const struct rb_node *node_b) +{ + struct annotated_data_type *a, *b; + + a = rb_entry(node_a, struct annotated_data_type, node); + b = rb_entry(node_b, struct annotated_data_type, node); + + if (a->type_size != b->type_size) + return a->type_size < b->type_size; + return strcmp(a->type_name, b->type_name) < 0; +} + +static struct annotated_data_type *dso__findnew_data_type(struct dso *dso, + Dwarf_Die *type_die) +{ + struct annotated_data_type *result = NULL; + struct annotated_data_type key; + struct rb_node *node; + struct strbuf sb; + char *type_name; + Dwarf_Word size; + + strbuf_init(&sb, 32); + if (die_get_typename_from_type(type_die, &sb) < 0) + strbuf_add(&sb, "(unknown type)", 14); + type_name = strbuf_detach(&sb, NULL); + dwarf_aggregate_size(type_die, &size); + + /* Check existing nodes in dso->data_types tree */ + key.type_name = type_name; + key.type_size = size; + node = rb_find(&key, &dso->data_types, data_type_cmp); + if (node) { + result = rb_entry(node, struct annotated_data_type, node); + free(type_name); + return result; + } + + /* If not, add a new one */ + result = zalloc(sizeof(*result)); + if (result == NULL) { + free(type_name); + return NULL; + } + + result->type_name = type_name; + result->type_size = size; + + rb_add(&result->node, &dso->data_types, data_type_less); + return result; +} + static bool find_cu_die(struct debuginfo *di, u64 pc, Dwarf_Die *cu_die) { Dwarf_Off off, next_off; @@ -130,7 +200,6 @@ struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip, struct dso *dso = map__dso(ms->map); struct debuginfo *di; Dwarf_Die type_die; - struct strbuf sb; u64 pc; di = debuginfo__new(dso->long_name); @@ -148,17 +217,23 @@ struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip, if (find_data_type_die(di, pc, reg, offset, &type_die) < 0) goto out; - result = zalloc(sizeof(*result)); - if (result == NULL) - goto out; - - strbuf_init(&sb, 32); - if (die_get_typename_from_type(&type_die, &sb) < 0) - strbuf_add(&sb, "(unknown type)", 14); - - result->type_name = strbuf_detach(&sb, NULL); + result = dso__findnew_data_type(dso, &type_die); out: debuginfo__delete(di); return result; } + +void annotated_data_type__tree_delete(struct rb_root *root) +{ + struct annotated_data_type *pos; + + while (!RB_EMPTY_ROOT(root)) { + struct rb_node *node = rb_first(root); + + rb_erase(node, root); + pos = rb_entry(node, struct annotated_data_type, node); + free(pos->type_name); + free(pos); + } +} diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 633147f78ca5e..ab9f187bd7f13 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -4,6 +4,7 @@ #include #include +#include #include struct map_symbol; @@ -16,6 +17,7 @@ struct map_symbol; * This represents a data type accessed by samples in the profile data. */ struct annotated_data_type { + struct rb_node node; char *type_name; int type_size; }; @@ -26,6 +28,9 @@ struct annotated_data_type { struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip, int reg, int offset); +/* Release all data type information in the tree */ +void annotated_data_type__tree_delete(struct rb_root *root); + #else /* HAVE_DWARF_SUPPORT */ static inline struct annotated_data_type * @@ -35,6 +40,10 @@ find_data_type(struct map_symbol *ms __maybe_unused, u64 ip __maybe_unused, return NULL; } +static inline void annotated_data_type__tree_delete(struct rb_root *root __maybe_unused) +{ +} + #endif /* HAVE_DWARF_SUPPORT */ #endif /* _PERF_ANNOTATE_DATA_H */ diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 1f629b6fb7cfe..22fd5fa806ed8 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -31,6 +31,7 @@ #include "debug.h" #include "string2.h" #include "vdso.h" +#include "annotate-data.h" static const char * const debuglink_paths[] = { "%.0s%s", @@ -1327,6 +1328,7 @@ struct dso *dso__new_id(const char *name, struct dso_id *id) dso->data.cache = RB_ROOT; dso->inlined_nodes = RB_ROOT_CACHED; dso->srclines = RB_ROOT_CACHED; + dso->data_types = RB_ROOT; dso->data.fd = -1; dso->data.status = DSO_DATA_STATUS_UNKNOWN; dso->symtab_type = DSO_BINARY_TYPE__NOT_FOUND; @@ -1370,6 +1372,8 @@ void dso__delete(struct dso *dso) symbols__delete(&dso->symbols); dso->symbol_names_len = 0; zfree(&dso->symbol_names); + annotated_data_type__tree_delete(&dso->data_types); + if (dso->short_name_allocated) { zfree((char **)&dso->short_name); dso->short_name_allocated = false; diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 3759de8c2267a..ce9f3849a773c 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -154,6 +154,8 @@ struct dso { size_t symbol_names_len; struct rb_root_cached inlined_nodes; struct rb_root_cached srclines; + struct rb_root data_types; + struct { u64 addr; struct symbol *symbol; -- GitLab From 0669729eb0afb0cf55fe1b97d7a8b1315354910f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:11 -0800 Subject: [PATCH 0516/1290] perf annotate: Factor out evsel__get_arch() The evsel__get_arch() is to get architecture info from the environment. It'll be used by other places later so let's factor it out. Also add arch__is() to check the arch info by name. Committer notes: "get" is usually associated with refcounting, so we better rename this at some point to a better name. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 44 +++++++++++++++++++++++++++----------- tools/perf/util/annotate.h | 2 ++ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index c81fa0791918e..27b2a9961cd5e 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -843,6 +843,11 @@ static struct arch *arch__find(const char *name) return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp); } +bool arch__is(struct arch *arch, const char *name) +{ + return !strcmp(arch->name, name); +} + static struct annotated_source *annotated_source__new(void) { struct annotated_source *src = zalloc(sizeof(*src)); @@ -2378,15 +2383,8 @@ void symbol__calc_percent(struct symbol *sym, struct evsel *evsel) annotation__calc_percent(notes, evsel, symbol__size(sym)); } -int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, - struct arch **parch) +static int evsel__get_arch(struct evsel *evsel, struct arch **parch) { - struct symbol *sym = ms->sym; - struct annotation *notes = symbol__annotation(sym); - struct annotate_args args = { - .evsel = evsel, - .options = &annotate_opts, - }; struct perf_env *env = evsel__env(evsel); const char *arch_name = perf_env__arch(env); struct arch *arch; @@ -2395,23 +2393,43 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, if (!arch_name) return errno; - args.arch = arch = arch__find(arch_name); + *parch = arch = arch__find(arch_name); if (arch == NULL) { pr_err("%s: unsupported arch %s\n", __func__, arch_name); return ENOTSUP; } - if (parch) - *parch = arch; - if (arch->init) { err = arch->init(arch, env ? env->cpuid : NULL); if (err) { - pr_err("%s: failed to initialize %s arch priv area\n", __func__, arch->name); + pr_err("%s: failed to initialize %s arch priv area\n", + __func__, arch->name); return err; } } + return 0; +} + +int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, + struct arch **parch) +{ + struct symbol *sym = ms->sym; + struct annotation *notes = symbol__annotation(sym); + struct annotate_args args = { + .evsel = evsel, + .options = &annotate_opts, + }; + struct arch *arch = NULL; + int err; + + err = evsel__get_arch(evsel, &arch); + if (err < 0) + return err; + + if (parch) + *parch = arch; + args.arch = arch; args.ms = *ms; if (annotate_opts.full_addr) notes->start = map__objdump_2mem(ms->map, ms->sym->start); diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 589f8aaf02366..2ef7e7dda7bdd 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -61,6 +61,8 @@ struct ins_operands { struct arch; +bool arch__is(struct arch *arch, const char *name); + struct ins_ops { void (*free)(struct ins_operands *ops); int (*parse)(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms); -- GitLab From 3a0c26edc3d2f39da3a91eb6eae404253e7ccbaa Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:12 -0800 Subject: [PATCH 0517/1290] perf annotate: Add annotate_get_insn_location() The annotate_get_insn_location() is to get the detailed information of instruction locations like registers and offset. It has source and target operands locations in an array. Each operand can have a register and an offset. The offset is meaningful when mem_ref flag is set. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 107 +++++++++++++++++++++++++++++++++++++ tools/perf/util/annotate.h | 36 +++++++++++++ 2 files changed, 143 insertions(+) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 27b2a9961cd5e..7c597440dc2eb 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -31,6 +31,7 @@ #include "bpf-utils.h" #include "block-range.h" #include "string2.h" +#include "dwarf-regs.h" #include "util/event.h" #include "util/sharded_mutex.h" #include "arch/common.h" @@ -3518,3 +3519,109 @@ int annotate_check_args(void) } return 0; } + +/* + * Get register number and access offset from the given instruction. + * It assumes AT&T x86 asm format like OFFSET(REG). Maybe it needs + * to revisit the format when it handles different architecture. + * Fills @reg and @offset when return 0. + */ +static int extract_reg_offset(struct arch *arch, const char *str, + struct annotated_op_loc *op_loc) +{ + char *p; + char *regname; + + if (arch->objdump.register_char == 0) + return -1; + + /* + * It should start from offset, but it's possible to skip 0 + * in the asm. So 0(%rax) should be same as (%rax). + * + * However, it also start with a segment select register like + * %gs:0x18(%rbx). In that case it should skip the part. + */ + if (*str == arch->objdump.register_char) { + while (*str && !isdigit(*str) && + *str != arch->objdump.memory_ref_char) + str++; + } + + op_loc->offset = strtol(str, &p, 0); + + p = strchr(p, arch->objdump.register_char); + if (p == NULL) + return -1; + + regname = strdup(p); + if (regname == NULL) + return -1; + + op_loc->reg = get_dwarf_regnum(regname, 0); + free(regname); + return 0; +} + +/** + * annotate_get_insn_location - Get location of instruction + * @arch: the architecture info + * @dl: the target instruction + * @loc: a buffer to save the data + * + * Get detailed location info (register and offset) in the instruction. + * It needs both source and target operand and whether it accesses a + * memory location. The offset field is meaningful only when the + * corresponding mem flag is set. + * + * Some examples on x86: + * + * mov (%rax), %rcx # src_reg = rax, src_mem = 1, src_offset = 0 + * # dst_reg = rcx, dst_mem = 0 + * + * mov 0x18, %r8 # src_reg = -1, dst_reg = r8 + */ +int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, + struct annotated_insn_loc *loc) +{ + struct ins_operands *ops; + struct annotated_op_loc *op_loc; + int i; + + if (!strcmp(dl->ins.name, "lock")) + ops = dl->ops.locked.ops; + else + ops = &dl->ops; + + if (ops == NULL) + return -1; + + memset(loc, 0, sizeof(*loc)); + + for_each_insn_op_loc(loc, i, op_loc) { + const char *insn_str = ops->source.raw; + + if (i == INSN_OP_TARGET) + insn_str = ops->target.raw; + + /* Invalidate the register by default */ + op_loc->reg = -1; + + if (insn_str == NULL) + continue; + + if (strchr(insn_str, arch->objdump.memory_ref_char)) { + op_loc->mem_ref = true; + extract_reg_offset(arch, insn_str, op_loc); + } else { + char *s = strdup(insn_str); + + if (s) { + op_loc->reg = get_dwarf_regnum(s, 0); + free(s); + } + } + } + + return 0; +} diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 2ef7e7dda7bdd..25ae8893d4f9b 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -439,4 +439,40 @@ int annotate_parse_percent_type(const struct option *opt, const char *_str, int annotate_check_args(void); +/** + * struct annotated_op_loc - Location info of instruction operand + * @reg: Register in the operand + * @offset: Memory access offset in the operand + * @mem_ref: Whether the operand accesses memory + */ +struct annotated_op_loc { + int reg; + int offset; + bool mem_ref; +}; + +enum annotated_insn_ops { + INSN_OP_SOURCE = 0, + INSN_OP_TARGET = 1, + + INSN_OP_MAX, +}; + +/** + * struct annotated_insn_loc - Location info of instruction + * @ops: Array of location info for source and target operands + */ +struct annotated_insn_loc { + struct annotated_op_loc ops[INSN_OP_MAX]; +}; + +#define for_each_insn_op_loc(insn_loc, i, op_loc) \ + for (i = INSN_OP_SOURCE, op_loc = &(insn_loc)->ops[i]; \ + i < INSN_OP_MAX; \ + i++, op_loc++) + +/* Get detailed location info in the instruction */ +int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, + struct annotated_insn_loc *loc); + #endif /* __PERF_ANNOTATE_H */ -- GitLab From 67bc54bbc5a25c0488cc488558a11c14c10f5f14 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:13 -0800 Subject: [PATCH 0518/1290] perf annotate: Implement hist_entry__get_data_type() It's the function to find out the type info from the given sample data and will be called from the hist_entry sort logic when 'type' sort key is used. It first calls objdump to disassemble the instructions and figure out information about memory access at the location. Maybe we can do it better by analyzing the instruction directly, but I'll leave it for later work. The memory access is determined by checking instruction operands to have "(" and then extract register name and offset. It'll return NULL if no data type is found. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-8-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 88 ++++++++++++++++++++++++++++++++++++++ tools/perf/util/annotate.h | 4 ++ 2 files changed, 92 insertions(+) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 7c597440dc2eb..8673eac4b9df4 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -25,6 +25,7 @@ #include "units.h" #include "debug.h" #include "annotate.h" +#include "annotate-data.h" #include "evsel.h" #include "evlist.h" #include "bpf-event.h" @@ -3625,3 +3626,90 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, return 0; } + +static void symbol__ensure_annotate(struct map_symbol *ms, struct evsel *evsel) +{ + struct disasm_line *dl, *tmp_dl; + struct annotation *notes; + + notes = symbol__annotation(ms->sym); + if (!list_empty(¬es->src->source)) + return; + + if (symbol__annotate(ms, evsel, NULL) < 0) + return; + + /* remove non-insn disasm lines for simplicity */ + list_for_each_entry_safe(dl, tmp_dl, ¬es->src->source, al.node) { + if (dl->al.offset == -1) { + list_del(&dl->al.node); + free(dl); + } + } +} + +static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip) +{ + struct disasm_line *dl; + struct annotation *notes; + + notes = symbol__annotation(sym); + + list_for_each_entry(dl, ¬es->src->source, al.node) { + if (sym->start + dl->al.offset == ip) + return dl; + } + return NULL; +} + +/** + * hist_entry__get_data_type - find data type for given hist entry + * @he: hist entry + * + * This function first annotates the instruction at @he->ip and extracts + * register and offset info from it. Then it searches the DWARF debug + * info to get a variable and type information using the address, register, + * and offset. + */ +struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) +{ + struct map_symbol *ms = &he->ms; + struct evsel *evsel = hists_to_evsel(he->hists); + struct arch *arch; + struct disasm_line *dl; + struct annotated_insn_loc loc; + struct annotated_op_loc *op_loc; + u64 ip = he->ip; + int i; + + if (ms->map == NULL || ms->sym == NULL) + return NULL; + + if (!symbol_conf.init_annotation) + return NULL; + + if (evsel__get_arch(evsel, &arch) < 0) + return NULL; + + /* Make sure it runs objdump to get disasm of the function */ + symbol__ensure_annotate(ms, evsel); + + /* + * Get a disasm to extract the location from the insn. + * This is too slow... + */ + dl = find_disasm_line(ms->sym, ip); + if (dl == NULL) + return NULL; + + if (annotate_get_insn_location(arch, dl, &loc) < 0) + return NULL; + + for_each_insn_op_loc(&loc, i, op_loc) { + if (!op_loc->mem_ref) + continue; + + return find_data_type(ms, ip, op_loc->reg, op_loc->offset); + } + return NULL; +} diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 25ae8893d4f9b..6c75b28322860 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -23,6 +23,7 @@ struct option; struct perf_sample; struct evsel; struct symbol; +struct annotated_data_type; struct ins { const char *name; @@ -475,4 +476,7 @@ struct annotated_insn_loc { int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, struct annotated_insn_loc *loc); +/* Returns a data type from the sample instruction (if any) */ +struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he); + #endif /* __PERF_ANNOTATE_H */ -- GitLab From 2f2c41bdd87f450a6a71c5d090d42c248ca4bf1e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:14 -0800 Subject: [PATCH 0519/1290] perf report: Add 'type' sort key The 'type' sort key is to aggregate hist entries by data type they access. Add mem_type field to hist_entry struct to save the type. If hist_entry__get_data_type() returns NULL, it'd use the 'unknown_type' instance. Committer testing: Before: # perf mem record sleep 2s [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.037 MB perf.data (4 samples) ] root@number:/home/acme/Downloads# perf report --stdio -s type Error: Unknown --sort key: `type' Usage: perf report [] -s, --sort sort by key(s): overhead overhead_sys overhead_us overhead_guest_sys overhead_guest_us overhead_children sample period pid comm dso symbol parent cpu socket srcline srcfile local_weight weight transaction trace symbol_size dso_size cgroup cgroup_id ipc_null time code_page_size local_ins_lat ins_lat local_p_stage_cyc p_stage_cyc addr local_retire_lat retire_lat simd dso_from dso_to symbol_from symbol_to mispredict abort in_tx cycles srcline_from srcline_to ipc_lbr addr_from addr_to symbol_daddr dso_daddr locked tlb mem snoop dcacheline symbol_iaddr phys_daddr data_page_size blocked # After: # perf report --stdio -s type # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 4 of event 'cpu_atom/mem-loads,ldlat=30/P' # Event count (approx.): 7 # # Overhead Data Type # ........ ......... # 100.00% (unknown) # # (Tip: Print event counts in CSV format with: perf stat -x,) # # rpm -q kernel-debuginfo kernel-debuginfo-6.6.4-200.fc39.x86_64 # uname -r 6.6.4-200.fc39.x86_64 # Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org> Cc: linux-trace-devel@vger.kernel.org> Link: https://lore.kernel.org/r/20231213001323.718046-9-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 1 + tools/perf/util/annotate-data.h | 2 + tools/perf/util/hist.h | 1 + tools/perf/util/sort.c | 69 +++++++++++++++++++++++- tools/perf/util/sort.h | 4 ++ 5 files changed, 75 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index af068b4f1e5a6..aec34417090b0 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -118,6 +118,7 @@ OPTIONS - retire_lat: On X86, this reports pipeline stall of this instruction compared to the previous instruction in cycles. And currently supported only on X86 - simd: Flags describing a SIMD operation. "e" for empty Arm SVE predicate. "p" for partial Arm SVE predicate + - type: Data type of sample memory access. By default, comm, dso and symbol keys are used. (i.e. --sort comm,dso,symbol) diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index ab9f187bd7f13..6efdd7e21b286 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -22,6 +22,8 @@ struct annotated_data_type { int type_size; }; +extern struct annotated_data_type unknown_type; + #ifdef HAVE_DWARF_SUPPORT /* Returns data type at the location (ip, reg, offset) */ diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 5d0db96609dff..7ebbf427b1eac 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -82,6 +82,7 @@ enum hist_column { HISTC_ADDR_TO, HISTC_ADDR, HISTC_SIMD, + HISTC_TYPE, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 27b123ccd2d15..e647f0117bb56 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -24,6 +24,7 @@ #include "strbuf.h" #include "mem-events.h" #include "annotate.h" +#include "annotate-data.h" #include "event.h" #include "time-utils.h" #include "cgroup.h" @@ -2094,7 +2095,7 @@ struct sort_entry sort_dso_size = { .se_width_idx = HISTC_DSO_SIZE, }; -/* --sort dso_size */ +/* --sort addr */ static int64_t sort__addr_cmp(struct hist_entry *left, struct hist_entry *right) @@ -2131,6 +2132,69 @@ struct sort_entry sort_addr = { .se_width_idx = HISTC_ADDR, }; +/* --sort type */ + +struct annotated_data_type unknown_type = { + .type_name = (char *)"(unknown)", +}; + +static int64_t +sort__type_cmp(struct hist_entry *left, struct hist_entry *right) +{ + return sort__addr_cmp(left, right); +} + +static void sort__type_init(struct hist_entry *he) +{ + if (he->mem_type) + return; + + he->mem_type = hist_entry__get_data_type(he); + if (he->mem_type == NULL) + he->mem_type = &unknown_type; +} + +static int64_t +sort__type_collapse(struct hist_entry *left, struct hist_entry *right) +{ + struct annotated_data_type *left_type = left->mem_type; + struct annotated_data_type *right_type = right->mem_type; + + if (!left_type) { + sort__type_init(left); + left_type = left->mem_type; + } + + if (!right_type) { + sort__type_init(right); + right_type = right->mem_type; + } + + return strcmp(left_type->type_name, right_type->type_name); +} + +static int64_t +sort__type_sort(struct hist_entry *left, struct hist_entry *right) +{ + return sort__type_collapse(left, right); +} + +static int hist_entry__type_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width) +{ + return repsep_snprintf(bf, size, "%-*s", width, he->mem_type->type_name); +} + +struct sort_entry sort_type = { + .se_header = "Data Type", + .se_cmp = sort__type_cmp, + .se_collapse = sort__type_collapse, + .se_sort = sort__type_sort, + .se_init = sort__type_init, + .se_snprintf = hist_entry__type_snprintf, + .se_width_idx = HISTC_TYPE, +}; + struct sort_dimension { const char *name; @@ -2185,7 +2249,8 @@ static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_ADDR, "addr", sort_addr), DIM(SORT_LOCAL_RETIRE_LAT, "local_retire_lat", sort_local_p_stage_cyc), DIM(SORT_GLOBAL_RETIRE_LAT, "retire_lat", sort_global_p_stage_cyc), - DIM(SORT_SIMD, "simd", sort_simd) + DIM(SORT_SIMD, "simd", sort_simd), + DIM(SORT_ANNOTATE_DATA_TYPE, "type", sort_type), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index ecfb7f1359d5e..aabf0b8331a35 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -15,6 +15,7 @@ struct option; struct thread; +struct annotated_data_type; extern regex_t parent_regex; extern const char *sort_order; @@ -34,6 +35,7 @@ extern struct sort_entry sort_dso_to; extern struct sort_entry sort_sym_from; extern struct sort_entry sort_sym_to; extern struct sort_entry sort_srcline; +extern struct sort_entry sort_type; extern const char default_mem_sort_order[]; extern bool chk_double_cl; @@ -154,6 +156,7 @@ struct hist_entry { struct perf_hpp_list *hpp_list; struct hist_entry *parent_he; struct hist_entry_ops *ops; + struct annotated_data_type *mem_type; union { /* this is for hierarchical entry structure */ struct { @@ -243,6 +246,7 @@ enum sort_type { SORT_LOCAL_RETIRE_LAT, SORT_GLOBAL_RETIRE_LAT, SORT_SIMD, + SORT_ANNOTATE_DATA_TYPE, /* branch stack specific sort keys */ __SORT_BRANCH_STACK, -- GitLab From 81e57deec32594b124d777b1d3ca8a1415410230 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:15 -0800 Subject: [PATCH 0520/1290] perf report: Support data type profiling Enable type annotation when the 'type' sort key is used. It shows type of variables the samples access at the moment. Users can see which types are accessed frequently. $ perf report -s dso,type --stdio ... # Overhead Shared Object Data Type # ........ ................. ......... # 35.47% [kernel.kallsyms] (unknown) 1.62% [kernel.kallsyms] struct sched_entry 1.23% [kernel.kallsyms] struct cfs_rq 0.83% [kernel.kallsyms] struct task_struct 0.34% [kernel.kallsyms] struct list_head 0.30% [kernel.kallsyms] struct mem_cgroup ... Committer testing: With the perf.data file collected in the previous cset: # perf report --stdio -s type # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 4 of event 'cpu_atom/mem-loads,ldlat=30/P' # Event count (approx.): 7 # # Overhead Data Type # ........ ......... # 42.86% struct list_head 42.86% (unknown) 14.29% char # # (Tip: To record callchains for each sample: perf record -g) # # perf report --stdio -s dso,type # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 4 of event 'cpu_atom/mem-loads,ldlat=30/P' # Event count (approx.): 7 # # Overhead Shared Object Data Type # ........ .................... ......... # 42.86% [kernel.kallsyms] struct list_head 28.57% libc.so.6 (unknown) 14.29% [kernel.kallsyms] char 14.29% ld-linux-x86-64.so.2 (unknown) # # (Tip: Save output of perf stat using: perf stat record ) # # Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-10-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 178fb602bc982..f2ed2b7e80a32 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -96,6 +96,7 @@ struct report { bool stitch_lbr; bool disable_order; bool skip_empty; + bool data_type; int max_stack; struct perf_read_values show_threads_values; const char *pretty_printing_style; @@ -170,7 +171,7 @@ static int hist_iter__report_callback(struct hist_entry_iter *iter, struct mem_info *mi; struct branch_info *bi; - if (!ui__has_annotation() && !rep->symbol_ipc) + if (!ui__has_annotation() && !rep->symbol_ipc && !rep->data_type) return 0; if (sort__mode == SORT_MODE__BRANCH) { @@ -1639,6 +1640,16 @@ repeat: sort_order = NULL; } + if (sort_order && strstr(sort_order, "type")) { + report.data_type = true; + annotate_opts.annotate_src = false; + +#ifndef HAVE_DWARF_GETLOCATIONS_SUPPORT + pr_err("Error: Data type profiling is disabled due to missing DWARF support\n"); + goto error; +#endif + } + if (strcmp(input_name, "-") != 0) setup_browser(true); else @@ -1697,7 +1708,7 @@ repeat: * so don't allocate extra space that won't be used in the stdio * implementation. */ - if (ui__has_annotation() || report.symbol_ipc || + if (ui__has_annotation() || report.symbol_ipc || report.data_type || report.total_cycles_mode) { ret = symbol__annotation_init(); if (ret < 0) -- GitLab From 4a111cadac85362ed9476737d7a36e8dd3a8e476 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:16 -0800 Subject: [PATCH 0521/1290] perf annotate-data: Add member field in the data type Add child member field if the current type is a composite type like a struct or union. The member fields are linked in the children list and do the same recursively if the child itself is a composite type. Add 'self' member to the annotated_data_type to handle the members in the same way. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-11-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 101 ++++++++++++++++++++++++++++---- tools/perf/util/annotate-data.h | 27 +++++++-- tools/perf/util/sort.c | 9 ++- 3 files changed, 119 insertions(+), 18 deletions(-) diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 1f921971174d2..5269c6630e045 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -31,9 +31,9 @@ static int data_type_cmp(const void *_key, const struct rb_node *node) type = rb_entry(node, struct annotated_data_type, node); - if (key->type_size != type->type_size) - return key->type_size - type->type_size; - return strcmp(key->type_name, type->type_name); + if (key->self.size != type->self.size) + return key->self.size - type->self.size; + return strcmp(key->self.type_name, type->self.type_name); } static bool data_type_less(struct rb_node *node_a, const struct rb_node *node_b) @@ -43,9 +43,80 @@ static bool data_type_less(struct rb_node *node_a, const struct rb_node *node_b) a = rb_entry(node_a, struct annotated_data_type, node); b = rb_entry(node_b, struct annotated_data_type, node); - if (a->type_size != b->type_size) - return a->type_size < b->type_size; - return strcmp(a->type_name, b->type_name) < 0; + if (a->self.size != b->self.size) + return a->self.size < b->self.size; + return strcmp(a->self.type_name, b->self.type_name) < 0; +} + +/* Recursively add new members for struct/union */ +static int __add_member_cb(Dwarf_Die *die, void *arg) +{ + struct annotated_member *parent = arg; + struct annotated_member *member; + Dwarf_Die member_type, die_mem; + Dwarf_Word size, loc; + Dwarf_Attribute attr; + struct strbuf sb; + int tag; + + if (dwarf_tag(die) != DW_TAG_member) + return DIE_FIND_CB_SIBLING; + + member = zalloc(sizeof(*member)); + if (member == NULL) + return DIE_FIND_CB_END; + + strbuf_init(&sb, 32); + die_get_typename(die, &sb); + + die_get_real_type(die, &member_type); + if (dwarf_aggregate_size(&member_type, &size) < 0) + size = 0; + + if (!dwarf_attr_integrate(die, DW_AT_data_member_location, &attr)) + loc = 0; + else + dwarf_formudata(&attr, &loc); + + member->type_name = strbuf_detach(&sb, NULL); + /* member->var_name can be NULL */ + if (dwarf_diename(die)) + member->var_name = strdup(dwarf_diename(die)); + member->size = size; + member->offset = loc + parent->offset; + INIT_LIST_HEAD(&member->children); + list_add_tail(&member->node, &parent->children); + + tag = dwarf_tag(&member_type); + switch (tag) { + case DW_TAG_structure_type: + case DW_TAG_union_type: + die_find_child(&member_type, __add_member_cb, member, &die_mem); + break; + default: + break; + } + return DIE_FIND_CB_SIBLING; +} + +static void add_member_types(struct annotated_data_type *parent, Dwarf_Die *type) +{ + Dwarf_Die die_mem; + + die_find_child(type, __add_member_cb, &parent->self, &die_mem); +} + +static void delete_members(struct annotated_member *member) +{ + struct annotated_member *child, *tmp; + + list_for_each_entry_safe(child, tmp, &member->children, node) { + list_del(&child->node); + delete_members(child); + free(child->type_name); + free(child->var_name); + free(child); + } } static struct annotated_data_type *dso__findnew_data_type(struct dso *dso, @@ -65,8 +136,8 @@ static struct annotated_data_type *dso__findnew_data_type(struct dso *dso, dwarf_aggregate_size(type_die, &size); /* Check existing nodes in dso->data_types tree */ - key.type_name = type_name; - key.type_size = size; + key.self.type_name = type_name; + key.self.size = size; node = rb_find(&key, &dso->data_types, data_type_cmp); if (node) { result = rb_entry(node, struct annotated_data_type, node); @@ -81,8 +152,15 @@ static struct annotated_data_type *dso__findnew_data_type(struct dso *dso, return NULL; } - result->type_name = type_name; - result->type_size = size; + result->self.type_name = type_name; + result->self.size = size; + INIT_LIST_HEAD(&result->self.children); + + /* + * Fill member info unconditionally for now, + * later perf annotate would need it. + */ + add_member_types(result, type_die); rb_add(&result->node, &dso->data_types, data_type_less); return result; @@ -233,7 +311,8 @@ void annotated_data_type__tree_delete(struct rb_root *root) rb_erase(node, root); pos = rb_entry(node, struct annotated_data_type, node); - free(pos->type_name); + delete_members(&pos->self); + free(pos->self.type_name); free(pos); } } diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 6efdd7e21b286..33748222e6aa4 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -9,17 +9,36 @@ struct map_symbol; +/** + * struct annotated_member - Type of member field + * @node: List entry in the parent list + * @children: List head for child nodes + * @type_name: Name of the member type + * @var_name: Name of the member variable + * @offset: Offset from the outer data type + * @size: Size of the member field + * + * This represents a member type in a data type. + */ +struct annotated_member { + struct list_head node; + struct list_head children; + char *type_name; + char *var_name; + int offset; + int size; +}; + /** * struct annotated_data_type - Data type to profile - * @type_name: Name of the data type - * @type_size: Size of the data type + * @node: RB-tree node for dso->type_tree + * @self: Actual type information * * This represents a data type accessed by samples in the profile data. */ struct annotated_data_type { struct rb_node node; - char *type_name; - int type_size; + struct annotated_member self; }; extern struct annotated_data_type unknown_type; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index e647f0117bb56..a41209e242ae4 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2135,7 +2135,10 @@ struct sort_entry sort_addr = { /* --sort type */ struct annotated_data_type unknown_type = { - .type_name = (char *)"(unknown)", + .self = { + .type_name = (char *)"(unknown)", + .children = LIST_HEAD_INIT(unknown_type.self.children), + }, }; static int64_t @@ -2170,7 +2173,7 @@ sort__type_collapse(struct hist_entry *left, struct hist_entry *right) right_type = right->mem_type; } - return strcmp(left_type->type_name, right_type->type_name); + return strcmp(left_type->self.type_name, right_type->self.type_name); } static int64_t @@ -2182,7 +2185,7 @@ sort__type_sort(struct hist_entry *left, struct hist_entry *right) static int hist_entry__type_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { - return repsep_snprintf(bf, size, "%-*s", width, he->mem_type->type_name); + return repsep_snprintf(bf, size, "%-*s", width, he->mem_type->self.type_name); } struct sort_entry sort_type = { -- GitLab From 9bd7ddd1576164fbb8d369c6a7b018af9544e202 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:17 -0800 Subject: [PATCH 0522/1290] perf annotate-data: Update sample histogram for type The annotated_data_type__update_samples() to get histogram for data type access. It'll be called by perf annotate to show which fields in the data type are accessed frequently. Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-12-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate-data.c | 81 +++++++++++++++++++++++++++++++++ tools/perf/util/annotate-data.h | 42 +++++++++++++++++ tools/perf/util/annotate.c | 9 +++- 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 5269c6630e045..868f52ccdb74c 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -13,6 +13,8 @@ #include "debuginfo.h" #include "debug.h" #include "dso.h" +#include "evsel.h" +#include "evlist.h" #include "map.h" #include "map_symbol.h" #include "strbuf.h" @@ -302,6 +304,44 @@ out: return result; } +static int alloc_data_type_histograms(struct annotated_data_type *adt, int nr_entries) +{ + int i; + size_t sz = sizeof(struct type_hist); + + sz += sizeof(struct type_hist_entry) * adt->self.size; + + /* Allocate a table of pointers for each event */ + adt->nr_histograms = nr_entries; + adt->histograms = calloc(nr_entries, sizeof(*adt->histograms)); + if (adt->histograms == NULL) + return -ENOMEM; + + /* + * Each histogram is allocated for the whole size of the type. + * TODO: Probably we can move the histogram to members. + */ + for (i = 0; i < nr_entries; i++) { + adt->histograms[i] = zalloc(sz); + if (adt->histograms[i] == NULL) + goto err; + } + return 0; + +err: + while (--i >= 0) + free(adt->histograms[i]); + free(adt->histograms); + return -ENOMEM; +} + +static void delete_data_type_histograms(struct annotated_data_type *adt) +{ + for (int i = 0; i < adt->nr_histograms; i++) + free(adt->histograms[i]); + free(adt->histograms); +} + void annotated_data_type__tree_delete(struct rb_root *root) { struct annotated_data_type *pos; @@ -312,7 +352,48 @@ void annotated_data_type__tree_delete(struct rb_root *root) rb_erase(node, root); pos = rb_entry(node, struct annotated_data_type, node); delete_members(&pos->self); + delete_data_type_histograms(pos); free(pos->self.type_name); free(pos); } } + +/** + * annotated_data_type__update_samples - Update histogram + * @adt: Data type to update + * @evsel: Event to update + * @offset: Offset in the type + * @nr_samples: Number of samples at this offset + * @period: Event count at this offset + * + * This function updates type histogram at @ofs for @evsel. Samples are + * aggregated before calling this function so it can be called with more + * than one samples at a certain offset. + */ +int annotated_data_type__update_samples(struct annotated_data_type *adt, + struct evsel *evsel, int offset, + int nr_samples, u64 period) +{ + struct type_hist *h; + + if (adt == NULL) + return 0; + + if (adt->histograms == NULL) { + int nr = evsel->evlist->core.nr_entries; + + if (alloc_data_type_histograms(adt, nr) < 0) + return -1; + } + + if (offset < 0 || offset >= adt->self.size) + return -1; + + h = adt->histograms[evsel->core.idx]; + + h->nr_samples += nr_samples; + h->addr[offset].nr_samples += nr_samples; + h->period += period; + h->addr[offset].period += period; + return 0; +} diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 33748222e6aa4..d2dc025b19345 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -7,6 +7,7 @@ #include #include +struct evsel; struct map_symbol; /** @@ -29,16 +30,42 @@ struct annotated_member { int size; }; +/** + * struct type_hist_entry - Histogram entry per offset + * @nr_samples: Number of samples + * @period: Count of event + */ +struct type_hist_entry { + int nr_samples; + u64 period; +}; + +/** + * struct type_hist - Type histogram for each event + * @nr_samples: Total number of samples in this data type + * @period: Total count of the event in this data type + * @offset: Array of histogram entry + */ +struct type_hist { + u64 nr_samples; + u64 period; + struct type_hist_entry addr[]; +}; + /** * struct annotated_data_type - Data type to profile * @node: RB-tree node for dso->type_tree * @self: Actual type information + * @nr_histogram: Number of histogram entries + * @histograms: An array of pointers to histograms * * This represents a data type accessed by samples in the profile data. */ struct annotated_data_type { struct rb_node node; struct annotated_member self; + int nr_histograms; + struct type_hist **histograms; }; extern struct annotated_data_type unknown_type; @@ -49,6 +76,11 @@ extern struct annotated_data_type unknown_type; struct annotated_data_type *find_data_type(struct map_symbol *ms, u64 ip, int reg, int offset); +/* Update type access histogram at the given offset */ +int annotated_data_type__update_samples(struct annotated_data_type *adt, + struct evsel *evsel, int offset, + int nr_samples, u64 period); + /* Release all data type information in the tree */ void annotated_data_type__tree_delete(struct rb_root *root); @@ -61,6 +93,16 @@ find_data_type(struct map_symbol *ms __maybe_unused, u64 ip __maybe_unused, return NULL; } +static inline int +annotated_data_type__update_samples(struct annotated_data_type *adt __maybe_unused, + struct evsel *evsel __maybe_unused, + int offset __maybe_unused, + int nr_samples __maybe_unused, + u64 period __maybe_unused) +{ + return -1; +} + static inline void annotated_data_type__tree_delete(struct rb_root *root __maybe_unused) { } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 8673eac4b9df4..6747779ecef82 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -3679,6 +3679,7 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) struct disasm_line *dl; struct annotated_insn_loc loc; struct annotated_op_loc *op_loc; + struct annotated_data_type *mem_type; u64 ip = he->ip; int i; @@ -3709,7 +3710,13 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) if (!op_loc->mem_ref) continue; - return find_data_type(ms, ip, op_loc->reg, op_loc->offset); + mem_type = find_data_type(ms, ip, op_loc->reg, op_loc->offset); + + annotated_data_type__update_samples(mem_type, evsel, + op_loc->offset, + he->stat.nr_events, + he->stat.period); + return mem_type; } return NULL; } -- GitLab From 871304a79f755b2ab594bbd21857ecb4c4aa57c9 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:18 -0800 Subject: [PATCH 0523/1290] perf report: Add 'typeoff' sort key The typeoff sort key shows the data type name, offset and the name of the field. This is useful to see which field in the struct is accessed most frequently. $ perf report -s type,typeoff --hierarchy --stdio ... # Overhead Data Type / Data Type Offset # ............ ............................ # ... 1.23% struct cfs_rq 0.19% struct cfs_rq +404 (throttle_count) 0.19% struct cfs_rq +0 (load.weight) 0.19% struct cfs_rq +336 (leaf_cfs_rq_list.next) 0.09% struct cfs_rq +272 (propagate) 0.09% struct cfs_rq +196 (removed.nr) 0.09% struct cfs_rq +80 (curr) 0.09% struct cfs_rq +544 (lt_b_children_throttled) 0.06% struct cfs_rq +320 (rq) Committer testing: Again with the perf.data from the previous csets: # perf report --stdio -s type,typeoff # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 4 of event 'cpu_atom/mem-loads,ldlat=30/P' # Event count (approx.): 7 # # Overhead Data Type Data Type Offset # ........ ......... ................ # 42.86% struct list_head struct list_head +8 (prev) 42.86% (unknown) (unknown) +0 (no field) 14.29% char char +0 (no field) # # (Tip: To see callchains in a more compact form: perf report -g folded) # # perf report --stdio -s dso,type,typeoff # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 4 of event 'cpu_atom/mem-loads,ldlat=30/P' # Event count (approx.): 7 # # Overhead Shared Object Data Type Data Type Offset # ........ .................... ......... ................ # 42.86% [kernel.kallsyms] struct list_head struct list_head +8 (prev) 28.57% libc.so.6 (unknown) (unknown) +0 (no field) 14.29% [kernel.kallsyms] char char +0 (no field) 14.29% ld-linux-x86-64.so.2 (unknown) (unknown) +0 (no field) # # (Tip: If you have debuginfo enabled, try: perf report -s sym,srcline) # # Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-13-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 1 + tools/perf/util/annotate.c | 1 + tools/perf/util/hist.h | 1 + tools/perf/util/sort.c | 83 +++++++++++++++++++++++- tools/perf/util/sort.h | 2 + 5 files changed, 87 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index aec34417090b0..b57eb51b47aa6 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -119,6 +119,7 @@ OPTIONS to the previous instruction in cycles. And currently supported only on X86 - simd: Flags describing a SIMD operation. "e" for empty Arm SVE predicate. "p" for partial Arm SVE predicate - type: Data type of sample memory access. + - typeoff: Offset in the data type of sample memory access. By default, comm, dso and symbol keys are used. (i.e. --sort comm,dso,symbol) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 6747779ecef82..f966e8f83c5ec 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -3716,6 +3716,7 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) op_loc->offset, he->stat.nr_events, he->stat.period); + he->mem_type_off = op_loc->offset; return mem_type; } return NULL; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 7ebbf427b1eac..18128a49309ef 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -83,6 +83,7 @@ enum hist_column { HISTC_ADDR, HISTC_SIMD, HISTC_TYPE, + HISTC_TYPE_OFFSET, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index a41209e242ae4..d78e680d39880 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2153,8 +2153,10 @@ static void sort__type_init(struct hist_entry *he) return; he->mem_type = hist_entry__get_data_type(he); - if (he->mem_type == NULL) + if (he->mem_type == NULL) { he->mem_type = &unknown_type; + he->mem_type_off = 0; + } } static int64_t @@ -2198,6 +2200,84 @@ struct sort_entry sort_type = { .se_width_idx = HISTC_TYPE, }; +/* --sort typeoff */ + +static int64_t +sort__typeoff_sort(struct hist_entry *left, struct hist_entry *right) +{ + struct annotated_data_type *left_type = left->mem_type; + struct annotated_data_type *right_type = right->mem_type; + int64_t ret; + + if (!left_type) { + sort__type_init(left); + left_type = left->mem_type; + } + + if (!right_type) { + sort__type_init(right); + right_type = right->mem_type; + } + + ret = strcmp(left_type->self.type_name, right_type->self.type_name); + if (ret) + return ret; + return left->mem_type_off - right->mem_type_off; +} + +static void fill_member_name(char *buf, size_t sz, struct annotated_member *m, + int offset, bool first) +{ + struct annotated_member *child; + + if (list_empty(&m->children)) + return; + + list_for_each_entry(child, &m->children, node) { + if (child->offset <= offset && offset < child->offset + child->size) { + int len = 0; + + /* It can have anonymous struct/union members */ + if (child->var_name) { + len = scnprintf(buf, sz, "%s%s", + first ? "" : ".", child->var_name); + first = false; + } + + fill_member_name(buf + len, sz - len, child, offset, first); + return; + } + } +} + +static int hist_entry__typeoff_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width __maybe_unused) +{ + struct annotated_data_type *he_type = he->mem_type; + char buf[4096]; + + buf[0] = '\0'; + if (list_empty(&he_type->self.children)) + snprintf(buf, sizeof(buf), "no field"); + else + fill_member_name(buf, sizeof(buf), &he_type->self, + he->mem_type_off, true); + buf[4095] = '\0'; + + return repsep_snprintf(bf, size, "%s %+d (%s)", he_type->self.type_name, + he->mem_type_off, buf); +} + +struct sort_entry sort_type_offset = { + .se_header = "Data Type Offset", + .se_cmp = sort__type_cmp, + .se_collapse = sort__typeoff_sort, + .se_sort = sort__typeoff_sort, + .se_init = sort__type_init, + .se_snprintf = hist_entry__typeoff_snprintf, + .se_width_idx = HISTC_TYPE_OFFSET, +}; + struct sort_dimension { const char *name; @@ -2254,6 +2334,7 @@ static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_GLOBAL_RETIRE_LAT, "retire_lat", sort_global_p_stage_cyc), DIM(SORT_SIMD, "simd", sort_simd), DIM(SORT_ANNOTATE_DATA_TYPE, "type", sort_type), + DIM(SORT_ANNOTATE_DATA_TYPE_OFFSET, "typeoff", sort_type_offset), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index aabf0b8331a35..d806adcc1e1ea 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -113,6 +113,7 @@ struct hist_entry { u64 p_stage_cyc; u8 cpumode; u8 depth; + int mem_type_off; struct simd_flags simd_flags; /* We are added by hists__add_dummy_entry. */ @@ -247,6 +248,7 @@ enum sort_type { SORT_GLOBAL_RETIRE_LAT, SORT_SIMD, SORT_ANNOTATE_DATA_TYPE, + SORT_ANNOTATE_DATA_TYPE_OFFSET, /* branch stack specific sort keys */ __SORT_BRANCH_STACK, -- GitLab From e2c1c8ff2d2ffec340b8fc73ee13b8fb516d1c6d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:19 -0800 Subject: [PATCH 0524/1290] perf report: Add 'symoff' sort key The symoff sort key is to print symbol and offset of sample. This is useful for data type profiling to show exact instruction in the function which refers the data. $ perf report -s type,sym,typeoff,symoff --hierarchy ... # Overhead Data Type / Symbol / Data Type Offset / Symbol Offset # .............. ..................................................... # 1.23% struct cfs_rq 0.84% update_blocked_averages 0.19% struct cfs_rq +336 (leaf_cfs_rq_list.next) 0.19% [k] update_blocked_averages+0x96 0.19% struct cfs_rq +0 (load.weight) 0.14% [k] update_blocked_averages+0x104 0.04% [k] update_blocked_averages+0x31c 0.17% struct cfs_rq +404 (throttle_count) 0.12% [k] update_blocked_averages+0x9d 0.05% [k] update_blocked_averages+0x1f9 0.08% struct cfs_rq +272 (propagate) 0.07% [k] update_blocked_averages+0x3d3 0.02% [k] update_blocked_averages+0x45b ... Committer testing: # perf report --stdio -s type,typeoff,symoff # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 4 of event 'cpu_atom/mem-loads,ldlat=30/P' # Event count (approx.): 7 # # Overhead Data Type Data Type Offset Symbol Offset # ........ ......... ................ ............. # 42.86% struct list_head struct list_head +8 (prev) [k] __list_del_entry_valid_or_report+0x7 28.57% (unknown) (unknown) +0 (no field) [.] _nl_intern_locale_data+0x25 14.29% char char +0 (no field) [k] strncpy_from_user+0xa5 14.29% (unknown) (unknown) +0 (no field) [.] _dl_lookup_symbol_x+0x50 # # (Tip: To change sampling frequency to 100 Hz: perf record -F 100) # Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-14-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 1 + tools/perf/util/hist.h | 1 + tools/perf/util/sort.c | 47 ++++++++++++++++++++++++ tools/perf/util/sort.h | 1 + 4 files changed, 50 insertions(+) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index b57eb51b47aa6..38f59ac064f7d 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -120,6 +120,7 @@ OPTIONS - simd: Flags describing a SIMD operation. "e" for empty Arm SVE predicate. "p" for partial Arm SVE predicate - type: Data type of sample memory access. - typeoff: Offset in the data type of sample memory access. + - symoff: Offset in the symbol. By default, comm, dso and symbol keys are used. (i.e. --sort comm,dso,symbol) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 18128a49309ef..4a0aea0c9e00e 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -84,6 +84,7 @@ enum hist_column { HISTC_SIMD, HISTC_TYPE, HISTC_TYPE_OFFSET, + HISTC_SYMBOL_OFFSET, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index d78e680d39880..0cbbd5ba8175a 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -419,6 +419,52 @@ struct sort_entry sort_sym = { .se_width_idx = HISTC_SYMBOL, }; +/* --sort symoff */ + +static int64_t +sort__symoff_cmp(struct hist_entry *left, struct hist_entry *right) +{ + int64_t ret; + + ret = sort__sym_cmp(left, right); + if (ret) + return ret; + + return left->ip - right->ip; +} + +static int64_t +sort__symoff_sort(struct hist_entry *left, struct hist_entry *right) +{ + int64_t ret; + + ret = sort__sym_sort(left, right); + if (ret) + return ret; + + return left->ip - right->ip; +} + +static int +hist_entry__symoff_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) +{ + struct symbol *sym = he->ms.sym; + + if (sym == NULL) + return repsep_snprintf(bf, size, "[%c] %-#.*llx", he->level, width - 4, he->ip); + + return repsep_snprintf(bf, size, "[%c] %s+0x%llx", he->level, sym->name, he->ip - sym->start); +} + +struct sort_entry sort_sym_offset = { + .se_header = "Symbol Offset", + .se_cmp = sort__symoff_cmp, + .se_sort = sort__symoff_sort, + .se_snprintf = hist_entry__symoff_snprintf, + .se_filter = hist_entry__sym_filter, + .se_width_idx = HISTC_SYMBOL_OFFSET, +}; + /* --sort srcline */ char *hist_entry__srcline(struct hist_entry *he) @@ -2335,6 +2381,7 @@ static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_SIMD, "simd", sort_simd), DIM(SORT_ANNOTATE_DATA_TYPE, "type", sort_type), DIM(SORT_ANNOTATE_DATA_TYPE_OFFSET, "typeoff", sort_type_offset), + DIM(SORT_SYM_OFFSET, "symoff", sort_sym_offset), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index d806adcc1e1ea..6f6b4189a3897 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -249,6 +249,7 @@ enum sort_type { SORT_SIMD, SORT_ANNOTATE_DATA_TYPE, SORT_ANNOTATE_DATA_TYPE_OFFSET, + SORT_SYM_OFFSET, /* branch stack specific sort keys */ __SORT_BRANCH_STACK, -- GitLab From 263925bf843f5ca8257317d74992f8e6b7d222e3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:20 -0800 Subject: [PATCH 0525/1290] perf annotate: Add --data-type option Support data type annotation with new --data-type option. It internally uses type sort key to collect sample histogram for the type and display every members like below. $ perf annotate --data-type ... Annotate type: 'struct cfs_rq' in [kernel.kallsyms] (13 samples): ============================================================================ samples offset size field 13 0 640 struct cfs_rq { 2 0 16 struct load_weight load { 2 0 8 unsigned long weight; 0 8 4 u32 inv_weight; }; 0 16 8 unsigned long runnable_weight; 0 24 4 unsigned int nr_running; 1 28 4 unsigned int h_nr_running; ... For simplicity it prints the number of samples per field for now. But it should be easy to show the overhead percentage instead. The number at the outer struct is a sum of the numbers of the inner members. For example, struct cfs_rq got total 13 samples, and 2 came from the load (struct load_weight) and 1 from h_nr_running. Similarly, the struct load_weight got total 2 samples and they all came from the weight field. I've added two new flags in the symbol_conf for this. The annotate_data_member is to get the members of the type. This is also needed for perf report with typeoff sort key. The annotate_data_sample is to update sample stats for each offset and used only in annotate. Currently it only support stdio output mode, TUI support can be added later. Committer testing: With the perf.data from the previous csets, a very simple, short duration one: # perf annotate --data-type Annotate type: 'struct list_head' in [kernel.kallsyms] (1 samples): ============================================================================ samples offset size field 1 0 16 struct list_head { 0 0 8 struct list_head* next; 1 8 8 struct list_head* prev; }; Annotate type: 'char' in [kernel.kallsyms] (1 samples): ============================================================================ samples offset size field 1 0 1 char ; # Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-15-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-annotate.txt | 8 ++ tools/perf/builtin-annotate.c | 97 +++++++++++++++++++++- tools/perf/util/annotate-data.c | 8 +- tools/perf/util/annotate.c | 10 ++- tools/perf/util/sort.c | 2 + tools/perf/util/symbol_conf.h | 4 +- 6 files changed, 118 insertions(+), 11 deletions(-) diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index fe168e8165c8d..0e6a49b7795c5 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -155,6 +155,14 @@ include::itrace.txt[] stdio or stdio2 (Default: 0). Note that this is about selection of functions to display, not about lines within the function. +--data-type[=TYPE_NAME]:: + Display data type annotation instead of code. It infers data type of + samples (if they are memory accessing instructions) using DWARF debug + information. It can take an optional argument of data type name. In + that case it'd show annotation for the type only, otherwise it'd show + all data types it finds. + + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-report[1] diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index d880f1b039fdf..8acfbbc1b9c2d 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -20,6 +20,7 @@ #include "util/evlist.h" #include "util/evsel.h" #include "util/annotate.h" +#include "util/annotate-data.h" #include "util/event.h" #include #include "util/parse-events.h" @@ -55,9 +56,11 @@ struct perf_annotate { bool skip_missing; bool has_br_stack; bool group_set; + bool data_type; float min_percent; const char *sym_hist_filter; const char *cpu_list; + const char *target_data_type; DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); }; @@ -322,6 +325,32 @@ static int hist_entry__tty_annotate(struct hist_entry *he, return symbol__tty_annotate2(&he->ms, evsel); } +static void print_annotated_data_type(struct annotated_data_type *mem_type, + struct annotated_member *member, + struct evsel *evsel, int indent) +{ + struct annotated_member *child; + struct type_hist *h = mem_type->histograms[evsel->core.idx]; + int i, samples = 0; + + for (i = 0; i < member->size; i++) + samples += h->addr[member->offset + i].nr_samples; + + printf(" %10d %10d %10d %*s%s\t%s", + samples, member->offset, member->size, indent, "", member->type_name, + member->var_name ?: ""); + + if (!list_empty(&member->children)) + printf(" {\n"); + + list_for_each_entry(child, &member->children, node) + print_annotated_data_type(mem_type, child, evsel, indent + 4); + + if (!list_empty(&member->children)) + printf("%*s}", 35 + indent, ""); + printf(";\n"); +} + static void hists__find_annotations(struct hists *hists, struct evsel *evsel, struct perf_annotate *ann) @@ -361,6 +390,40 @@ find_next: continue; } + if (ann->data_type) { + struct dso *dso = map__dso(he->ms.map); + + /* skip unknown type */ + if (he->mem_type->histograms == NULL) + goto find_next; + + if (ann->target_data_type) { + const char *type_name = he->mem_type->self.type_name; + + /* skip 'struct ' prefix in the type name */ + if (strncmp(ann->target_data_type, "struct ", 7) && + !strncmp(type_name, "struct ", 7)) + type_name += 7; + + /* skip 'union ' prefix in the type name */ + if (strncmp(ann->target_data_type, "union ", 6) && + !strncmp(type_name, "union ", 6)) + type_name += 6; + + if (strcmp(ann->target_data_type, type_name)) + goto find_next; + } + + printf("Annotate type: '%s' in %s (%d samples):\n", + he->mem_type->self.type_name, dso->name, he->stat.nr_events); + printf("============================================================================\n"); + printf(" %10s %10s %10s %s\n", "samples", "offset", "size", "field"); + + print_annotated_data_type(he->mem_type, &he->mem_type->self, evsel, 0); + printf("\n"); + goto find_next; + } + if (use_browser == 2) { int ret; int (*annotate)(struct hist_entry *he, @@ -496,6 +559,17 @@ static int parse_percent_limit(const struct option *opt, const char *str, return 0; } +static int parse_data_type(const struct option *opt, const char *str, int unset) +{ + struct perf_annotate *ann = opt->value; + + ann->data_type = !unset; + if (str) + ann->target_data_type = strdup(str); + + return 0; +} + static const char * const annotate_usage[] = { "perf annotate []", NULL @@ -607,6 +681,9 @@ int cmd_annotate(int argc, const char **argv) OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts", "Instruction Tracing options\n" ITRACE_HELP, itrace_parse_synth_opts), + OPT_CALLBACK_OPTARG(0, "data-type", &annotate, NULL, "name", + "Show data type annotate for the memory accesses", + parse_data_type), OPT_END() }; @@ -661,6 +738,13 @@ int cmd_annotate(int argc, const char **argv) } #endif +#ifndef HAVE_DWARF_GETLOCATIONS_SUPPORT + if (annotate.data_type) { + pr_err("Error: Data type profiling is disabled due to missing DWARF support\n"); + return -ENOTSUP; + } +#endif + ret = symbol__validate_sym_arguments(); if (ret) return ret; @@ -703,6 +787,14 @@ int cmd_annotate(int argc, const char **argv) use_browser = 2; #endif + /* FIXME: only support stdio for now */ + if (annotate.data_type) { + use_browser = 0; + annotate_opts.annotate_src = false; + symbol_conf.annotate_data_member = true; + symbol_conf.annotate_data_sample = true; + } + setup_browser(true); /* @@ -710,7 +802,10 @@ int cmd_annotate(int argc, const char **argv) * symbol, we do not care about the processes in annotate, * set sort order to avoid repeated output. */ - sort_order = "dso,symbol"; + if (annotate.data_type) + sort_order = "dso,type"; + else + sort_order = "dso,symbol"; /* * Set SORT_MODE__BRANCH so that annotate display IPC/Cycle diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 868f52ccdb74c..df9689f46619c 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -19,6 +19,7 @@ #include "map_symbol.h" #include "strbuf.h" #include "symbol.h" +#include "symbol_conf.h" /* * Compare type name and size to maintain them in a tree. @@ -158,11 +159,8 @@ static struct annotated_data_type *dso__findnew_data_type(struct dso *dso, result->self.size = size; INIT_LIST_HEAD(&result->self.children); - /* - * Fill member info unconditionally for now, - * later perf annotate would need it. - */ - add_member_types(result, type_die); + if (symbol_conf.annotate_data_member) + add_member_types(result, type_die); rb_add(&result->node, &dso->data_types, data_type_less); return result; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index f966e8f83c5ec..68424ee0215e4 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -3712,10 +3712,12 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) mem_type = find_data_type(ms, ip, op_loc->reg, op_loc->offset); - annotated_data_type__update_samples(mem_type, evsel, - op_loc->offset, - he->stat.nr_events, - he->stat.period); + if (symbol_conf.annotate_data_sample) { + annotated_data_type__update_samples(mem_type, evsel, + op_loc->offset, + he->stat.nr_events, + he->stat.period); + } he->mem_type_off = op_loc->offset; return mem_type; } diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 0cbbd5ba8175a..30254eb637099 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -3401,6 +3401,8 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok, list->thread = 1; } else if (sd->entry == &sort_comm) { list->comm = 1; + } else if (sd->entry == &sort_type_offset) { + symbol_conf.annotate_data_member = true; } return __sort_dimension__add(sd, list, level); diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h index 6040286e07a65..c114bbceef401 100644 --- a/tools/perf/util/symbol_conf.h +++ b/tools/perf/util/symbol_conf.h @@ -44,7 +44,9 @@ struct symbol_conf { buildid_mmap2, guest_code, lazy_load_kernel_maps, - keep_exited_threads; + keep_exited_threads, + annotate_data_member, + annotate_data_sample; const char *vmlinux_name, *kallsyms_name, *source_prefix, -- GitLab From 227ad323854ab48d45966461d7a0a94e03f19368 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:21 -0800 Subject: [PATCH 0526/1290] perf annotate: Support event group display When events are grouped together, it'd be natural to show them at once like in other mode. Handle group leaders with members to collect the number of samples together and display like below: $ perf annotate --data-type --group ... Annotate type: 'struct page' in vmlinux (1 samples): event[0] = cpu/mem-loads,ldlat=30/P event[1] = cpu/mem-stores/P event[2] = dummy:u ============================================================================ samples offset size field 1 0 0 0 64 struct page { 0 0 0 0 8 long unsigned int flags; 0 0 0 8 40 union { 0 0 0 8 40 struct { 0 0 0 8 16 union { 0 0 0 8 16 struct list_head lru { 0 0 0 8 8 struct list_head* next; 0 0 0 16 8 struct list_head* prev; }; 0 0 0 8 16 struct { 0 0 0 8 8 void* __filler; 0 0 0 16 4 unsigned int mlock_count; }; 0 0 0 8 16 struct list_head buddy_list { 0 0 0 8 8 struct list_head* next; 0 0 0 16 8 struct list_head* prev; }; Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-16-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 89 ++++++++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 12 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 8acfbbc1b9c2d..3956ea1334cc5 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -325,19 +325,64 @@ static int hist_entry__tty_annotate(struct hist_entry *he, return symbol__tty_annotate2(&he->ms, evsel); } +static void print_annotated_data_header(struct hist_entry *he, struct evsel *evsel) +{ + struct dso *dso = map__dso(he->ms.map); + int nr_members = 1; + int nr_samples = he->stat.nr_events; + + if (evsel__is_group_event(evsel)) { + struct hist_entry *pair; + + list_for_each_entry(pair, &he->pairs.head, pairs.node) + nr_samples += pair->stat.nr_events; + } + + printf("Annotate type: '%s' in %s (%d samples):\n", + he->mem_type->self.type_name, dso->name, nr_samples); + + if (evsel__is_group_event(evsel)) { + struct evsel *pos; + int i = 0; + + for_each_group_evsel(pos, evsel) + printf(" event[%d] = %s\n", i++, pos->name); + + nr_members = evsel->core.nr_members; + } + + printf("============================================================================\n"); + printf("%*s %10s %10s %s\n", 11 * nr_members, "samples", "offset", "size", "field"); +} + static void print_annotated_data_type(struct annotated_data_type *mem_type, struct annotated_member *member, struct evsel *evsel, int indent) { struct annotated_member *child; struct type_hist *h = mem_type->histograms[evsel->core.idx]; - int i, samples = 0; + int i, nr_events = 1, samples = 0; for (i = 0; i < member->size; i++) samples += h->addr[member->offset + i].nr_samples; + printf(" %10d", samples); + + if (evsel__is_group_event(evsel)) { + struct evsel *pos; + + for_each_group_member(pos, evsel) { + h = mem_type->histograms[pos->core.idx]; + + samples = 0; + for (i = 0; i < member->size; i++) + samples += h->addr[member->offset + i].nr_samples; + printf(" %10d", samples); + } + nr_events = evsel->core.nr_members; + } - printf(" %10d %10d %10d %*s%s\t%s", - samples, member->offset, member->size, indent, "", member->type_name, + printf(" %10d %10d %*s%s\t%s", + member->offset, member->size, indent, "", member->type_name, member->var_name ?: ""); if (!list_empty(&member->children)) @@ -347,7 +392,7 @@ static void print_annotated_data_type(struct annotated_data_type *mem_type, print_annotated_data_type(mem_type, child, evsel, indent + 4); if (!list_empty(&member->children)) - printf("%*s}", 35 + indent, ""); + printf("%*s}", 11 * nr_events + 24 + indent, ""); printf(";\n"); } @@ -391,8 +436,6 @@ find_next: } if (ann->data_type) { - struct dso *dso = map__dso(he->ms.map); - /* skip unknown type */ if (he->mem_type->histograms == NULL) goto find_next; @@ -414,11 +457,7 @@ find_next: goto find_next; } - printf("Annotate type: '%s' in %s (%d samples):\n", - he->mem_type->self.type_name, dso->name, he->stat.nr_events); - printf("============================================================================\n"); - printf(" %10s %10s %10s %s\n", "samples", "offset", "size", "field"); - + print_annotated_data_header(he, evsel); print_annotated_data_type(he->mem_type, &he->mem_type->self, evsel, 0); printf("\n"); goto find_next; @@ -521,8 +560,20 @@ static int __cmd_annotate(struct perf_annotate *ann) evsel__reset_sample_bit(pos, CALLCHAIN); evsel__output_resort(pos, NULL); - if (symbol_conf.event_group && !evsel__is_group_leader(pos)) + /* + * An event group needs to display other events too. + * Let's delay printing until other events are processed. + */ + if (symbol_conf.event_group) { + if (!evsel__is_group_leader(pos)) { + struct hists *leader_hists; + + leader_hists = evsel__hists(evsel__leader(pos)); + hists__match(leader_hists, hists); + hists__link(leader_hists, hists); + } continue; + } hists__find_annotations(hists, pos, ann); } @@ -533,6 +584,20 @@ static int __cmd_annotate(struct perf_annotate *ann) goto out; } + /* Display group events together */ + evlist__for_each_entry(session->evlist, pos) { + struct hists *hists = evsel__hists(pos); + u32 nr_samples = hists->stats.nr_samples; + + if (nr_samples == 0) + continue; + + if (!symbol_conf.event_group || !evsel__is_group_leader(pos)) + continue; + + hists__find_annotations(hists, pos, ann); + } + if (use_browser == 2) { void (*show_annotations)(void); -- GitLab From 61a9741e9f78c64c5178e4ae9d405eeceff04c8f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:22 -0800 Subject: [PATCH 0527/1290] perf annotate: Add --type-stat option for debugging The --type-stat option is to be used with --data-type and to print detailed failure reasons for the data type annotation. $ perf annotate --data-type --type-stat Annotate data type stats: total 294, ok 116 (39.5%), bad 178 (60.5%) ----------------------------------------------------------- 30 : no_sym 40 : no_insn_ops 33 : no_mem_ops 63 : no_var 4 : no_typeinfo 8 : bad_offset Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-17-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-annotate.txt | 3 ++ tools/perf/builtin-annotate.c | 44 +++++++++++++++++++++- tools/perf/util/annotate-data.c | 10 ++++- tools/perf/util/annotate-data.h | 31 +++++++++++++++ tools/perf/util/annotate.c | 27 ++++++++++--- 5 files changed, 108 insertions(+), 7 deletions(-) diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index 0e6a49b7795c5..b95524bea021e 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -162,6 +162,9 @@ include::itrace.txt[] that case it'd show annotation for the type only, otherwise it'd show all data types it finds. +--type-stat:: + Show stats for the data type annotation. + SEE ALSO -------- diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 3956ea1334cc5..55f97ab1395ba 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -57,6 +57,7 @@ struct perf_annotate { bool has_br_stack; bool group_set; bool data_type; + bool type_stat; float min_percent; const char *sym_hist_filter; const char *cpu_list; @@ -396,6 +397,43 @@ static void print_annotated_data_type(struct annotated_data_type *mem_type, printf(";\n"); } +static void print_annotate_data_stat(struct annotated_data_stat *s) +{ +#define PRINT_STAT(fld) if (s->fld) printf("%10d : %s\n", s->fld, #fld) + + int bad = s->no_sym + + s->no_insn + + s->no_insn_ops + + s->no_mem_ops + + s->no_reg + + s->no_dbginfo + + s->no_cuinfo + + s->no_var + + s->no_typeinfo + + s->invalid_size + + s->bad_offset; + int ok = s->total - bad; + + printf("Annotate data type stats:\n"); + printf("total %d, ok %d (%.1f%%), bad %d (%.1f%%)\n", + s->total, ok, 100.0 * ok / (s->total ?: 1), bad, 100.0 * bad / (s->total ?: 1)); + printf("-----------------------------------------------------------\n"); + PRINT_STAT(no_sym); + PRINT_STAT(no_insn); + PRINT_STAT(no_insn_ops); + PRINT_STAT(no_mem_ops); + PRINT_STAT(no_reg); + PRINT_STAT(no_dbginfo); + PRINT_STAT(no_cuinfo); + PRINT_STAT(no_var); + PRINT_STAT(no_typeinfo); + PRINT_STAT(invalid_size); + PRINT_STAT(bad_offset); + printf("\n"); + +#undef PRINT_STAT +} + static void hists__find_annotations(struct hists *hists, struct evsel *evsel, struct perf_annotate *ann) @@ -403,6 +441,9 @@ static void hists__find_annotations(struct hists *hists, struct rb_node *nd = rb_first_cached(&hists->entries), *next; int key = K_RIGHT; + if (ann->type_stat) + print_annotate_data_stat(&ann_data_stat); + while (nd) { struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node); struct annotation *notes; @@ -749,7 +790,8 @@ int cmd_annotate(int argc, const char **argv) OPT_CALLBACK_OPTARG(0, "data-type", &annotate, NULL, "name", "Show data type annotate for the memory accesses", parse_data_type), - + OPT_BOOLEAN(0, "type-stat", &annotate.type_stat, + "Show stats for the data type annotation"), OPT_END() }; int ret; diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index df9689f46619c..f22b4f18271c9 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -199,6 +199,7 @@ static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset) /* Get the type of the variable */ if (die_get_real_type(var_die, type_die) == NULL) { pr_debug("variable has no type\n"); + ann_data_stat.no_typeinfo++; return -1; } @@ -209,18 +210,21 @@ static int check_variable(Dwarf_Die *var_die, Dwarf_Die *type_die, int offset) if (dwarf_tag(type_die) != DW_TAG_pointer_type || die_get_real_type(type_die, type_die) == NULL) { pr_debug("no pointer or no type\n"); + ann_data_stat.no_typeinfo++; return -1; } /* Get the size of the actual type */ if (dwarf_aggregate_size(type_die, &size) < 0) { pr_debug("type size is unknown\n"); + ann_data_stat.invalid_size++; return -1; } /* Minimal sanity check */ if ((unsigned)offset >= size) { pr_debug("offset: %d is bigger than size: %" PRIu64 "\n", offset, size); + ann_data_stat.bad_offset++; return -1; } @@ -239,6 +243,7 @@ static int find_data_type_die(struct debuginfo *di, u64 pc, /* Get a compile_unit for this address */ if (!find_cu_die(di, pc, &cu_die)) { pr_debug("cannot find CU for address %" PRIx64 "\n", pc); + ann_data_stat.no_cuinfo++; return -1; } @@ -253,9 +258,12 @@ static int find_data_type_die(struct debuginfo *di, u64 pc, /* Found a variable, see if it's correct */ ret = check_variable(&var_die, type_die, offset); - break; + goto out; } + if (ret < 0) + ann_data_stat.no_var++; +out: free(scopes); return ret; } diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index d2dc025b19345..8e73096c01d1a 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -70,6 +70,37 @@ struct annotated_data_type { extern struct annotated_data_type unknown_type; +/** + * struct annotated_data_stat - Debug statistics + * @total: Total number of entry + * @no_sym: No symbol or map found + * @no_insn: Failed to get disasm line + * @no_insn_ops: The instruction has no operands + * @no_mem_ops: The instruction has no memory operands + * @no_reg: Failed to extract a register from the operand + * @no_dbginfo: The binary has no debug information + * @no_cuinfo: Failed to find a compile_unit + * @no_var: Failed to find a matching variable + * @no_typeinfo: Failed to get a type info for the variable + * @invalid_size: Failed to get a size info of the type + * @bad_offset: The access offset is out of the type + */ +struct annotated_data_stat { + int total; + int no_sym; + int no_insn; + int no_insn_ops; + int no_mem_ops; + int no_reg; + int no_dbginfo; + int no_cuinfo; + int no_var; + int no_typeinfo; + int invalid_size; + int bad_offset; +}; +extern struct annotated_data_stat ann_data_stat; + #ifdef HAVE_DWARF_SUPPORT /* Returns data type at the location (ip, reg, offset) */ diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 68424ee0215e4..9870257ce21e7 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -103,6 +103,9 @@ static struct ins_ops nop_ops; static struct ins_ops lock_ops; static struct ins_ops ret_ops; +/* Data type collection debug statistics */ +struct annotated_data_stat ann_data_stat; + static int arch__grow_instructions(struct arch *arch) { struct ins *new_instructions; @@ -3683,14 +3686,22 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) u64 ip = he->ip; int i; - if (ms->map == NULL || ms->sym == NULL) + ann_data_stat.total++; + + if (ms->map == NULL || ms->sym == NULL) { + ann_data_stat.no_sym++; return NULL; + } - if (!symbol_conf.init_annotation) + if (!symbol_conf.init_annotation) { + ann_data_stat.no_sym++; return NULL; + } - if (evsel__get_arch(evsel, &arch) < 0) + if (evsel__get_arch(evsel, &arch) < 0) { + ann_data_stat.no_insn++; return NULL; + } /* Make sure it runs objdump to get disasm of the function */ symbol__ensure_annotate(ms, evsel); @@ -3700,11 +3711,15 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) * This is too slow... */ dl = find_disasm_line(ms->sym, ip); - if (dl == NULL) + if (dl == NULL) { + ann_data_stat.no_insn++; return NULL; + } - if (annotate_get_insn_location(arch, dl, &loc) < 0) + if (annotate_get_insn_location(arch, dl, &loc) < 0) { + ann_data_stat.no_insn_ops++; return NULL; + } for_each_insn_op_loc(&loc, i, op_loc) { if (!op_loc->mem_ref) @@ -3721,5 +3736,7 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) he->mem_type_off = op_loc->offset; return mem_type; } + + ann_data_stat.no_mem_ops++; return NULL; } -- GitLab From 58824fa0087e1cb732edbf1f112a5ea0b2205c8b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 12 Dec 2023 16:13:23 -0800 Subject: [PATCH 0528/1290] perf annotate: Add --insn-stat option for debugging This is for a debugging purpose. It'd be useful to see per-instrucion level success/failure stats. $ perf annotate --data-type --insn-stat Annotate Instruction stats total 264, ok 143 (54.2%), bad 121 (45.8%) Name : Good Bad ----------------------------------------------------------- movq : 45 31 movl : 22 11 popq : 0 19 cmpl : 16 3 addq : 8 7 cmpq : 11 3 cmpxchgl : 3 7 cmpxchgq : 8 0 incl : 3 3 movzbl : 4 2 incq : 4 2 decl : 6 0 ... Committer notes: So these are about being able to find the type for accesses from these instructions, we should improve the naming, but it is for debugging, we can improve this later: @@ -3726,6 +3759,10 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) continue; mem_type = find_data_type(ms, ip, op_loc->reg, op_loc->offset); + if (mem_type) + istat->good++; + else + istat->bad++; Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: linux-toolchains@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Link: https://lore.kernel.org/r/20231213001323.718046-18-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 41 +++++++++++++++++++++++++++++++++++ tools/perf/util/annotate.c | 38 ++++++++++++++++++++++++++++++++ tools/perf/util/annotate.h | 8 +++++++ 3 files changed, 87 insertions(+) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 55f97ab1395ba..6c1cc797692d9 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -58,6 +58,7 @@ struct perf_annotate { bool group_set; bool data_type; bool type_stat; + bool insn_stat; float min_percent; const char *sym_hist_filter; const char *cpu_list; @@ -434,6 +435,42 @@ static void print_annotate_data_stat(struct annotated_data_stat *s) #undef PRINT_STAT } +static void print_annotate_item_stat(struct list_head *head, const char *title) +{ + struct annotated_item_stat *istat, *pos, *iter; + int total_good, total_bad, total; + int sum1, sum2; + LIST_HEAD(tmp); + + /* sort the list by count */ + list_splice_init(head, &tmp); + total_good = total_bad = 0; + + list_for_each_entry_safe(istat, pos, &tmp, list) { + total_good += istat->good; + total_bad += istat->bad; + sum1 = istat->good + istat->bad; + + list_for_each_entry(iter, head, list) { + sum2 = iter->good + iter->bad; + if (sum1 > sum2) + break; + } + list_move_tail(&istat->list, &iter->list); + } + total = total_good + total_bad; + + printf("Annotate %s stats\n", title); + printf("total %d, ok %d (%.1f%%), bad %d (%.1f%%)\n\n", total, + total_good, 100.0 * total_good / (total ?: 1), + total_bad, 100.0 * total_bad / (total ?: 1)); + printf(" %-10s: %5s %5s\n", "Name", "Good", "Bad"); + printf("-----------------------------------------------------------\n"); + list_for_each_entry(istat, head, list) + printf(" %-10s: %5d %5d\n", istat->name, istat->good, istat->bad); + printf("\n"); +} + static void hists__find_annotations(struct hists *hists, struct evsel *evsel, struct perf_annotate *ann) @@ -443,6 +480,8 @@ static void hists__find_annotations(struct hists *hists, if (ann->type_stat) print_annotate_data_stat(&ann_data_stat); + if (ann->insn_stat) + print_annotate_item_stat(&ann_insn_stat, "Instruction"); while (nd) { struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node); @@ -792,6 +831,8 @@ int cmd_annotate(int argc, const char **argv) parse_data_type), OPT_BOOLEAN(0, "type-stat", &annotate.type_stat, "Show stats for the data type annotation"), + OPT_BOOLEAN(0, "insn-stat", &annotate.insn_stat, + "Show instruction stats for the data type annotation"), OPT_END() }; int ret; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 9870257ce21e7..9b70ab110ce79 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -105,6 +105,7 @@ static struct ins_ops ret_ops; /* Data type collection debug statistics */ struct annotated_data_stat ann_data_stat; +LIST_HEAD(ann_insn_stat); static int arch__grow_instructions(struct arch *arch) { @@ -3665,6 +3666,30 @@ static struct disasm_line *find_disasm_line(struct symbol *sym, u64 ip) return NULL; } +static struct annotated_item_stat *annotate_data_stat(struct list_head *head, + const char *name) +{ + struct annotated_item_stat *istat; + + list_for_each_entry(istat, head, list) { + if (!strcmp(istat->name, name)) + return istat; + } + + istat = zalloc(sizeof(*istat)); + if (istat == NULL) + return NULL; + + istat->name = strdup(name); + if (istat->name == NULL) { + free(istat); + return NULL; + } + + list_add_tail(&istat->list, head); + return istat; +} + /** * hist_entry__get_data_type - find data type for given hist entry * @he: hist entry @@ -3683,6 +3708,7 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) struct annotated_insn_loc loc; struct annotated_op_loc *op_loc; struct annotated_data_type *mem_type; + struct annotated_item_stat *istat; u64 ip = he->ip; int i; @@ -3716,8 +3742,15 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) return NULL; } + istat = annotate_data_stat(&ann_insn_stat, dl->ins.name); + if (istat == NULL) { + ann_data_stat.no_insn++; + return NULL; + } + if (annotate_get_insn_location(arch, dl, &loc) < 0) { ann_data_stat.no_insn_ops++; + istat->bad++; return NULL; } @@ -3726,6 +3759,10 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) continue; mem_type = find_data_type(ms, ip, op_loc->reg, op_loc->offset); + if (mem_type) + istat->good++; + else + istat->bad++; if (symbol_conf.annotate_data_sample) { annotated_data_type__update_samples(mem_type, evsel, @@ -3738,5 +3775,6 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) } ann_data_stat.no_mem_ops++; + istat->bad++; return NULL; } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 6c75b28322860..dba50762c6e80 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -479,4 +479,12 @@ int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl, /* Returns a data type from the sample instruction (if any) */ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he); +struct annotated_item_stat { + struct list_head list; + char *name; + int good; + int bad; +}; +extern struct list_head ann_insn_stat; + #endif /* __PERF_ANNOTATE_H */ -- GitLab From ad7ced12a08b5cf8fd0df7d0f764d4801a2b545a Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Sun, 24 Dec 2023 00:04:00 -0800 Subject: [PATCH 0529/1290] dt-bindings: touchscreen: convert neonode,zforce to json-schema Convert Neonode infrared touchscreen controller binding to DT schema. Signed-off-by: Andreas Kemnade Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231223221213.774868-2-andreas@kemnade.info Signed-off-by: Dmitry Torokhov --- .../input/touchscreen/neonode,zforce.yaml | 67 +++++++++++++++++++ .../bindings/input/touchscreen/zforce_ts.txt | 34 ---------- 2 files changed, 67 insertions(+), 34 deletions(-) create mode 100644 Documentation/devicetree/bindings/input/touchscreen/neonode,zforce.yaml delete mode 100644 Documentation/devicetree/bindings/input/touchscreen/zforce_ts.txt diff --git a/Documentation/devicetree/bindings/input/touchscreen/neonode,zforce.yaml b/Documentation/devicetree/bindings/input/touchscreen/neonode,zforce.yaml new file mode 100644 index 0000000000000..c39662815a6c5 --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/neonode,zforce.yaml @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/touchscreen/neonode,zforce.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Neonode infrared touchscreen controller + +maintainers: + - Heiko Stuebner + +properties: + compatible: + const: neonode,zforce + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + reset-gpios: + maxItems: 1 + + irq-gpios: + maxItems: 1 + + x-size: + $ref: /schemas/types.yaml#/definitions/uint32 + + y-size: + $ref: /schemas/types.yaml#/definitions/uint32 + + vdd-supply: true + +required: + - compatible + - reg + - interrupts + - reset-gpios + - x-size + - y-size + +unevaluatedProperties: false + +examples: + - | + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + touchscreen@50 { + compatible = "neonode,zforce"; + reg = <0x50>; + interrupts = <2 0>; + vdd-supply = <®_zforce_vdd>; + + reset-gpios = <&gpio5 9 0>; /* RST */ + irq-gpios = <&gpio5 6 0>; /* IRQ, optional */ + + x-size = <800>; + y-size = <600>; + }; + }; +... diff --git a/Documentation/devicetree/bindings/input/touchscreen/zforce_ts.txt b/Documentation/devicetree/bindings/input/touchscreen/zforce_ts.txt deleted file mode 100644 index e3c27c4fd9c85..0000000000000 --- a/Documentation/devicetree/bindings/input/touchscreen/zforce_ts.txt +++ /dev/null @@ -1,34 +0,0 @@ -* Neonode infrared touchscreen controller - -Required properties: -- compatible: must be "neonode,zforce" -- reg: I2C address of the chip -- interrupts: interrupt to which the chip is connected -- reset-gpios: reset gpio the chip is connected to -- x-size: horizontal resolution of touchscreen -- y-size: vertical resolution of touchscreen - -Optional properties: -- irq-gpios : interrupt gpio the chip is connected to -- vdd-supply: Regulator controlling the controller supply - -Example: - - i2c@00000000 { - /* ... */ - - zforce_ts@50 { - compatible = "neonode,zforce"; - reg = <0x50>; - interrupts = <2 0>; - vdd-supply = <®_zforce_vdd>; - - reset-gpios = <&gpio5 9 0>; /* RST */ - irq-gpios = <&gpio5 6 0>; /* IRQ, optional */ - - x-size = <800>; - y-size = <600>; - }; - - /* ... */ - }; -- GitLab From cc040e42fed8e80409ac5cf8663f0cd9b7951028 Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Sun, 24 Dec 2023 00:04:12 -0800 Subject: [PATCH 0530/1290] dt-bindings: touchscreen: neonode,zforce: Use standard properties Enable touchscreen orientation to be specified by using standard properties. Signed-off-by: Andreas Kemnade Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20231223221213.774868-3-andreas@kemnade.info Signed-off-by: Dmitry Torokhov --- .../bindings/input/touchscreen/neonode,zforce.yaml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/input/touchscreen/neonode,zforce.yaml b/Documentation/devicetree/bindings/input/touchscreen/neonode,zforce.yaml index c39662815a6c5..c2ee89b76ea13 100644 --- a/Documentation/devicetree/bindings/input/touchscreen/neonode,zforce.yaml +++ b/Documentation/devicetree/bindings/input/touchscreen/neonode,zforce.yaml @@ -9,6 +9,9 @@ title: Neonode infrared touchscreen controller maintainers: - Heiko Stuebner +allOf: + - $ref: touchscreen.yaml# + properties: compatible: const: neonode,zforce @@ -26,9 +29,11 @@ properties: maxItems: 1 x-size: + deprecated: true $ref: /schemas/types.yaml#/definitions/uint32 y-size: + deprecated: true $ref: /schemas/types.yaml#/definitions/uint32 vdd-supply: true @@ -38,8 +43,6 @@ required: - reg - interrupts - reset-gpios - - x-size - - y-size unevaluatedProperties: false @@ -60,8 +63,10 @@ examples: reset-gpios = <&gpio5 9 0>; /* RST */ irq-gpios = <&gpio5 6 0>; /* IRQ, optional */ - x-size = <800>; - y-size = <600>; + touchscreen-min-x = <0>; + touchscreen-size-x = <800>; + touchscreen-min-y = <0>; + touchscreen-size-y = <600>; }; }; ... -- GitLab From 435e84ec2009bf40625ee7ca1f8453d3b22edf75 Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Sun, 24 Dec 2023 00:04:23 -0800 Subject: [PATCH 0531/1290] Input: zforce_ts - accept standard touchscreen properties Only driver-specific properties were accepted, change it to use the now-available standard properties. Signed-off-by: Andreas Kemnade Link: https://lore.kernel.org/r/20231223221213.774868-4-andreas@kemnade.info Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/zforce_ts.c | 34 +++++++++++++-------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/drivers/input/touchscreen/zforce_ts.c b/drivers/input/touchscreen/zforce_ts.c index 5be5112845e1e..5680075f0bb84 100644 --- a/drivers/input/touchscreen/zforce_ts.c +++ b/drivers/input/touchscreen/zforce_ts.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -106,6 +107,7 @@ struct zforce_point { struct zforce_ts { struct i2c_client *client; struct input_dev *input; + struct touchscreen_properties prop; const struct zforce_ts_platdata *pdata; char phys[32]; @@ -266,7 +268,6 @@ static int zforce_setconfig(struct zforce_ts *ts, char b1) static int zforce_start(struct zforce_ts *ts) { struct i2c_client *client = ts->client; - const struct zforce_ts_platdata *pdata = ts->pdata; int ret; dev_dbg(&client->dev, "starting device\n"); @@ -277,7 +278,7 @@ static int zforce_start(struct zforce_ts *ts) return ret; } - ret = zforce_resolution(ts, pdata->x_max, pdata->y_max); + ret = zforce_resolution(ts, ts->prop.max_x, ts->prop.max_y); if (ret) { dev_err(&client->dev, "Unable to set resolution, %d\n", ret); goto error; @@ -337,7 +338,6 @@ static int zforce_stop(struct zforce_ts *ts) static int zforce_touch_event(struct zforce_ts *ts, u8 *payload) { struct i2c_client *client = ts->client; - const struct zforce_ts_platdata *pdata = ts->pdata; struct zforce_point point; int count, i, num = 0; @@ -355,8 +355,8 @@ static int zforce_touch_event(struct zforce_ts *ts, u8 *payload) point.coord_y = payload[9 * i + 4] << 8 | payload[9 * i + 3]; - if (point.coord_x > pdata->x_max || - point.coord_y > pdata->y_max) { + if (point.coord_x > ts->prop.max_x || + point.coord_y > ts->prop.max_y) { dev_warn(&client->dev, "coordinates (%d,%d) invalid\n", point.coord_x, point.coord_y); point.coord_x = point.coord_y = 0; @@ -390,10 +390,9 @@ static int zforce_touch_event(struct zforce_ts *ts, u8 *payload) point.state != STATE_UP); if (point.state != STATE_UP) { - input_report_abs(ts->input, ABS_MT_POSITION_X, - point.coord_x); - input_report_abs(ts->input, ABS_MT_POSITION_Y, - point.coord_y); + touchscreen_report_pos(ts->input, &ts->prop, + point.coord_x, point.coord_y, + true); input_report_abs(ts->input, ABS_MT_TOUCH_MAJOR, point.area_major); input_report_abs(ts->input, ABS_MT_TOUCH_MINOR, @@ -719,15 +718,8 @@ static struct zforce_ts_platdata *zforce_parse_dt(struct device *dev) return ERR_PTR(-ENOMEM); } - if (of_property_read_u32(np, "x-size", &pdata->x_max)) { - dev_err(dev, "failed to get x-size property\n"); - return ERR_PTR(-EINVAL); - } - - if (of_property_read_u32(np, "y-size", &pdata->y_max)) { - dev_err(dev, "failed to get y-size property\n"); - return ERR_PTR(-EINVAL); - } + of_property_read_u32(np, "x-size", &pdata->x_max); + of_property_read_u32(np, "y-size", &pdata->y_max); return pdata; } @@ -856,6 +848,12 @@ static int zforce_probe(struct i2c_client *client) input_set_abs_params(input_dev, ABS_MT_POSITION_Y, 0, pdata->y_max, 0, 0); + touchscreen_parse_properties(input_dev, true, &ts->prop); + if (ts->prop.max_x == 0 || ts->prop.max_y == 0) { + dev_err(&client->dev, "no size specified\n"); + return -EINVAL; + } + input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR, 0, ZFORCE_MAX_AREA, 0, 0); input_set_abs_params(input_dev, ABS_MT_TOUCH_MINOR, 0, -- GitLab From 47757ea83a545536cdd418fec84b7a970710e48b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 Nov 2023 15:29:09 +0000 Subject: [PATCH 0532/1290] netfs, fscache: Move fs/fscache/* into fs/netfs/ There's a problem with dependencies between netfslib and fscache as each wants to access some functions of the other. Deal with this by moving fs/fscache/* into fs/netfs/ and renaming those files to begin with "fscache-". For the moment, the moved files are changed as little as possible and an fscache module is still built. A subsequent patch will integrate them. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Christian Brauner cc: linux-fsdevel@vger.kernel.org cc: linux-cachefs@redhat.com --- MAINTAINERS | 21 ++++++---- fs/Kconfig | 1 - fs/Makefile | 1 - fs/fscache/Kconfig | 40 ------------------- fs/fscache/Makefile | 16 -------- fs/netfs/Kconfig | 39 ++++++++++++++++++ fs/netfs/Makefile | 14 ++++++- fs/{fscache/cache.c => netfs/fscache_cache.c} | 0 .../cookie.c => netfs/fscache_cookie.c} | 0 .../internal.h => netfs/fscache_internal.h} | 0 fs/{fscache/io.c => netfs/fscache_io.c} | 0 fs/{fscache/main.c => netfs/fscache_main.c} | 0 fs/{fscache/proc.c => netfs/fscache_proc.c} | 0 fs/{fscache/stats.c => netfs/fscache_stats.c} | 0 .../volume.c => netfs/fscache_volume.c} | 0 fs/netfs/internal.h | 5 +++ fs/netfs/main.c | 5 ++- 17 files changed, 73 insertions(+), 69 deletions(-) delete mode 100644 fs/fscache/Kconfig delete mode 100644 fs/fscache/Makefile rename fs/{fscache/cache.c => netfs/fscache_cache.c} (100%) rename fs/{fscache/cookie.c => netfs/fscache_cookie.c} (100%) rename fs/{fscache/internal.h => netfs/fscache_internal.h} (100%) rename fs/{fscache/io.c => netfs/fscache_io.c} (100%) rename fs/{fscache/main.c => netfs/fscache_main.c} (100%) rename fs/{fscache/proc.c => netfs/fscache_proc.c} (100%) rename fs/{fscache/stats.c => netfs/fscache_stats.c} (100%) rename fs/{fscache/volume.c => netfs/fscache_volume.c} (100%) diff --git a/MAINTAINERS b/MAINTAINERS index 7cef2d2ef8d70..d836f88b0fe14 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8133,6 +8133,19 @@ S: Supported F: fs/iomap/ F: include/linux/iomap.h +FILESYSTEMS [NETFS LIBRARY] +M: David Howells +L: linux-cachefs@redhat.com (moderated for non-subscribers) +L: linux-fsdevel@vger.kernel.org +S: Supported +F: Documentation/filesystems/caching/ +F: Documentation/filesystems/netfs_library.rst +F: fs/netfs/ +F: include/linux/fscache*.h +F: include/linux/netfs.h +F: include/trace/events/fscache.h +F: include/trace/events/netfs.h + FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER M: Riku Voipio L: linux-hwmon@vger.kernel.org @@ -8567,14 +8580,6 @@ F: Documentation/power/freezing-of-tasks.rst F: include/linux/freezer.h F: kernel/freezer.c -FS-CACHE: LOCAL CACHING FOR NETWORK FILESYSTEMS -M: David Howells -L: linux-cachefs@redhat.com (moderated for non-subscribers) -S: Supported -F: Documentation/filesystems/caching/ -F: fs/fscache/ -F: include/linux/fscache*.h - FSCRYPT: FILE SYSTEM LEVEL ENCRYPTION SUPPORT M: Eric Biggers M: Theodore Y. Ts'o diff --git a/fs/Kconfig b/fs/Kconfig index 42837617a55b5..c935c341eb6ed 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -140,7 +140,6 @@ source "fs/overlayfs/Kconfig" menu "Caches" source "fs/netfs/Kconfig" -source "fs/fscache/Kconfig" source "fs/cachefiles/Kconfig" endmenu diff --git a/fs/Makefile b/fs/Makefile index 75522f88e7636..af7632368e987 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -60,7 +60,6 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_NETFS_SUPPORT) += netfs/ -obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT4_FS) += ext4/ # We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig deleted file mode 100644 index b313a978ae0a2..0000000000000 --- a/fs/fscache/Kconfig +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config FSCACHE - tristate "General filesystem local caching manager" - select NETFS_SUPPORT - help - This option enables a generic filesystem caching manager that can be - used by various network and other filesystems to cache data locally. - Different sorts of caches can be plugged in, depending on the - resources available. - - See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_STATS - bool "Gather statistical information on local caching" - depends on FSCACHE && PROC_FS - select NETFS_STATS - help - This option causes statistical information to be gathered on local - caching and exported through file: - - /proc/fs/fscache/stats - - The gathering of statistics adds a certain amount of overhead to - execution as there are a quite a few stats gathered, and on a - multi-CPU system these may be on cachelines that keep bouncing - between CPUs. On the other hand, the stats are very useful for - debugging purposes. Saying 'Y' here is recommended. - - See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_DEBUG - bool "Debug FS-Cache" - depends on FSCACHE - help - This permits debugging to be dynamically enabled in the local caching - management module. If this is set, the debugging output may be - enabled by setting bits in /sys/modules/fscache/parameter/debug. - - See Documentation/filesystems/caching/fscache.rst for more information. diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile deleted file mode 100644 index afb090ea16c40..0000000000000 --- a/fs/fscache/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for general filesystem caching code -# - -fscache-y := \ - cache.o \ - cookie.o \ - io.o \ - main.o \ - volume.o - -fscache-$(CONFIG_PROC_FS) += proc.o -fscache-$(CONFIG_FSCACHE_STATS) += stats.o - -obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index b4db21022cb43..b4378688357c6 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -21,3 +21,42 @@ config NETFS_STATS multi-CPU system these may be on cachelines that keep bouncing between CPUs. On the other hand, the stats are very useful for debugging purposes. Saying 'Y' here is recommended. + +config FSCACHE + tristate "General filesystem local caching manager" + select NETFS_SUPPORT + help + This option enables a generic filesystem caching manager that can be + used by various network and other filesystems to cache data locally. + Different sorts of caches can be plugged in, depending on the + resources available. + + See Documentation/filesystems/caching/fscache.rst for more information. + +config FSCACHE_STATS + bool "Gather statistical information on local caching" + depends on FSCACHE && PROC_FS + select NETFS_STATS + help + This option causes statistical information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/stats + + The gathering of statistics adds a certain amount of overhead to + execution as there are a quite a few stats gathered, and on a + multi-CPU system these may be on cachelines that keep bouncing + between CPUs. On the other hand, the stats are very useful for + debugging purposes. Saying 'Y' here is recommended. + + See Documentation/filesystems/caching/fscache.rst for more information. + +config FSCACHE_DEBUG + bool "Debug FS-Cache" + depends on FSCACHE + help + This permits debugging to be dynamically enabled in the local caching + management module. If this is set, the debugging output may be + enabled by setting bits in /sys/modules/fscache/parameter/debug. + + See Documentation/filesystems/caching/fscache.rst for more information. diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index 386d6fb92793a..bbb2b824bd5e9 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -1,5 +1,17 @@ # SPDX-License-Identifier: GPL-2.0 +fscache-y := \ + fscache_cache.o \ + fscache_cookie.o \ + fscache_io.o \ + fscache_main.o \ + fscache_volume.o + +fscache-$(CONFIG_PROC_FS) += fscache_proc.o +fscache-$(CONFIG_FSCACHE_STATS) += fscache_stats.o + +obj-$(CONFIG_FSCACHE) := fscache.o + netfs-y := \ buffered_read.o \ io.o \ @@ -9,4 +21,4 @@ netfs-y := \ netfs-$(CONFIG_NETFS_STATS) += stats.o -obj-$(CONFIG_NETFS_SUPPORT) := netfs.o +obj-$(CONFIG_NETFS_SUPPORT) += netfs.o diff --git a/fs/fscache/cache.c b/fs/netfs/fscache_cache.c similarity index 100% rename from fs/fscache/cache.c rename to fs/netfs/fscache_cache.c diff --git a/fs/fscache/cookie.c b/fs/netfs/fscache_cookie.c similarity index 100% rename from fs/fscache/cookie.c rename to fs/netfs/fscache_cookie.c diff --git a/fs/fscache/internal.h b/fs/netfs/fscache_internal.h similarity index 100% rename from fs/fscache/internal.h rename to fs/netfs/fscache_internal.h diff --git a/fs/fscache/io.c b/fs/netfs/fscache_io.c similarity index 100% rename from fs/fscache/io.c rename to fs/netfs/fscache_io.c diff --git a/fs/fscache/main.c b/fs/netfs/fscache_main.c similarity index 100% rename from fs/fscache/main.c rename to fs/netfs/fscache_main.c diff --git a/fs/fscache/proc.c b/fs/netfs/fscache_proc.c similarity index 100% rename from fs/fscache/proc.c rename to fs/netfs/fscache_proc.c diff --git a/fs/fscache/stats.c b/fs/netfs/fscache_stats.c similarity index 100% rename from fs/fscache/stats.c rename to fs/netfs/fscache_stats.c diff --git a/fs/fscache/volume.c b/fs/netfs/fscache_volume.c similarity index 100% rename from fs/fscache/volume.c rename to fs/netfs/fscache_volume.c diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 43fac1b14e40c..e96432499eb2a 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -5,9 +5,12 @@ * Written by David Howells (dhowells@redhat.com) */ +#include +#include #include #include #include +#include "fscache_internal.h" #ifdef pr_fmt #undef pr_fmt @@ -107,6 +110,7 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) /* * debug tracing */ +#if 0 #define dbgprintk(FMT, ...) \ printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) @@ -143,3 +147,4 @@ do { \ #define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) #define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) #endif +#endif diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 068568702957e..237c54a01d971 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -8,8 +8,8 @@ #include #include #include "internal.h" -#define CREATE_TRACE_POINTS -#include +//#define CREATE_TRACE_POINTS +//#include MODULE_DESCRIPTION("Network fs support"); MODULE_AUTHOR("Red Hat, Inc."); @@ -18,3 +18,4 @@ MODULE_LICENSE("GPL"); unsigned netfs_debug; module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); + -- GitLab From 915cd30cdea8811cddd8f59e57dd9dd0a814b76c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 Nov 2023 15:55:18 +0000 Subject: [PATCH 0533/1290] netfs, fscache: Combine fscache with netfs Now that the fscache code is moved to be colocated with the netfslib code so that they combined into one module, do the combining. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Christian Brauner cc: linux-fsdevel@vger.kernel.org cc: linux-cachefs@redhat.com cc: linux-nfs@vger.kernel.org, cc: linux-erofs@lists.ozlabs.org --- arch/arm/configs/mxs_defconfig | 3 +- arch/csky/configs/defconfig | 3 +- arch/mips/configs/ip27_defconfig | 3 +- arch/mips/configs/lemote2f_defconfig | 3 +- arch/mips/configs/loongson3_defconfig | 3 +- arch/mips/configs/pic32mzda_defconfig | 3 +- arch/s390/configs/debug_defconfig | 3 +- arch/s390/configs/defconfig | 3 +- arch/sh/configs/sdk7786_defconfig | 3 +- fs/cachefiles/Kconfig | 2 +- fs/erofs/Kconfig | 7 +- fs/netfs/Kconfig | 4 +- fs/netfs/Makefile | 24 +-- fs/netfs/fscache_internal.h | 267 +------------------------- fs/netfs/fscache_main.c | 17 +- fs/netfs/internal.h | 192 +++++++++++++++++- fs/netfs/main.c | 4 +- fs/nfs/Kconfig | 4 +- 18 files changed, 237 insertions(+), 311 deletions(-) diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig index feb38a94c1a70..43bc1255a5db9 100644 --- a/arch/arm/configs/mxs_defconfig +++ b/arch/arm/configs/mxs_defconfig @@ -138,7 +138,8 @@ CONFIG_PWM_MXS=y CONFIG_NVMEM_MXS_OCOTP=y CONFIG_EXT4_FS=y # CONFIG_DNOTIFY is not set -CONFIG_FSCACHE=m +CONFIG_NETFS_SUPPORT=m +CONFIG_FSCACHE=y CONFIG_FSCACHE_STATS=y CONFIG_CACHEFILES=m CONFIG_VFAT_FS=y diff --git a/arch/csky/configs/defconfig b/arch/csky/configs/defconfig index af722e4dfb47d..ff559e5162aa1 100644 --- a/arch/csky/configs/defconfig +++ b/arch/csky/configs/defconfig @@ -34,7 +34,8 @@ CONFIG_GENERIC_PHY=y CONFIG_EXT4_FS=y CONFIG_FANOTIFY=y CONFIG_QUOTA=y -CONFIG_FSCACHE=m +CONFIG_NETFS_SUPPORT=m +CONFIG_FSCACHE=y CONFIG_FSCACHE_STATS=y CONFIG_CACHEFILES=m CONFIG_MSDOS_FS=y diff --git a/arch/mips/configs/ip27_defconfig b/arch/mips/configs/ip27_defconfig index b51f738a39a05..4714074c8bd7f 100644 --- a/arch/mips/configs/ip27_defconfig +++ b/arch/mips/configs/ip27_defconfig @@ -287,7 +287,8 @@ CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_FUSE_FS=m CONFIG_CUSE=m -CONFIG_FSCACHE=m +CONFIG_NETFS_SUPPORT=m +CONFIG_FSCACHE=y CONFIG_FSCACHE_STATS=y CONFIG_CACHEFILES=m CONFIG_PROC_KCORE=y diff --git a/arch/mips/configs/lemote2f_defconfig b/arch/mips/configs/lemote2f_defconfig index 38f17b6584218..3389e6e885d9f 100644 --- a/arch/mips/configs/lemote2f_defconfig +++ b/arch/mips/configs/lemote2f_defconfig @@ -238,7 +238,8 @@ CONFIG_BTRFS_FS=m CONFIG_QUOTA=y CONFIG_QFMT_V2=m CONFIG_AUTOFS_FS=m -CONFIG_FSCACHE=m +CONFIG_NETFS_SUPPORT=m +CONFIG_FSCACHE=y CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=m CONFIG_JOLIET=y diff --git a/arch/mips/configs/loongson3_defconfig b/arch/mips/configs/loongson3_defconfig index 07839a4b397e5..78f4987520664 100644 --- a/arch/mips/configs/loongson3_defconfig +++ b/arch/mips/configs/loongson3_defconfig @@ -356,7 +356,8 @@ CONFIG_QFMT_V2=m CONFIG_AUTOFS_FS=y CONFIG_FUSE_FS=m CONFIG_VIRTIO_FS=m -CONFIG_FSCACHE=m +CONFIG_NETFS_SUPPORT=m +CONFIG_FSCACHE=y CONFIG_ISO9660_FS=m CONFIG_JOLIET=y CONFIG_MSDOS_FS=m diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig index 166d2ad372d14..54774f90c23ea 100644 --- a/arch/mips/configs/pic32mzda_defconfig +++ b/arch/mips/configs/pic32mzda_defconfig @@ -68,7 +68,8 @@ CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y CONFIG_AUTOFS_FS=m CONFIG_FUSE_FS=m -CONFIG_FSCACHE=m +CONFIG_NETFS_SUPPORT=m +CONFIG_FSCACHE=y CONFIG_ISO9660_FS=m CONFIG_JOLIET=y CONFIG_ZISOFS=y diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 6de44ede4e14d..060c4207ef2e6 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -637,8 +637,9 @@ CONFIG_FUSE_FS=y CONFIG_CUSE=m CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m +CONFIG_NETFS_SUPPORT=m CONFIG_NETFS_STATS=y -CONFIG_FSCACHE=m +CONFIG_FSCACHE=y CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=y CONFIG_JOLIET=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index bcae47da6b7cd..dbfa2115d875d 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -622,8 +622,9 @@ CONFIG_FUSE_FS=y CONFIG_CUSE=m CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=m +CONFIG_NETFS_SUPPORT=m CONFIG_NETFS_STATS=y -CONFIG_FSCACHE=m +CONFIG_FSCACHE=y CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=y CONFIG_JOLIET=y diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index cf59b98446e4d..7b427c17fbfec 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -171,7 +171,8 @@ CONFIG_BTRFS_FS=y CONFIG_AUTOFS_FS=m CONFIG_FUSE_FS=y CONFIG_CUSE=m -CONFIG_FSCACHE=m +CONFIG_NETFS_SUPPORT=m +CONFIG_FSCACHE=y CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=m CONFIG_JOLIET=y diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index 8df715640a48f..c5a070550ee33 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -2,7 +2,7 @@ config CACHEFILES tristate "Filesystem caching on files" - depends on FSCACHE && BLOCK + depends on NETFS_SUPPORT && FSCACHE && BLOCK help This permits use of a mounted filesystem as a cache for other filesystems - primarily networking filesystems - thus allowing fast diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 1d318f85232de..fffd3919343e4 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -114,8 +114,11 @@ config EROFS_FS_ZIP_DEFLATE config EROFS_FS_ONDEMAND bool "EROFS fscache-based on-demand read support" - depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y) - default n + depends on EROFS_FS + select NETFS_SUPPORT + select FSCACHE + select CACHEFILES + select CACHEFILES_ONDEMAND help This permits EROFS to use fscache-backed data blobs with on-demand read support. diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index b4378688357c6..bec805e0c44c0 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -23,8 +23,8 @@ config NETFS_STATS debugging purposes. Saying 'Y' here is recommended. config FSCACHE - tristate "General filesystem local caching manager" - select NETFS_SUPPORT + bool "General filesystem local caching manager" + depends on NETFS_SUPPORT help This option enables a generic filesystem caching manager that can be used by various network and other filesystems to cache data locally. diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index bbb2b824bd5e9..b57162ef9cfb3 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -1,17 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -fscache-y := \ - fscache_cache.o \ - fscache_cookie.o \ - fscache_io.o \ - fscache_main.o \ - fscache_volume.o - -fscache-$(CONFIG_PROC_FS) += fscache_proc.o -fscache-$(CONFIG_FSCACHE_STATS) += fscache_stats.o - -obj-$(CONFIG_FSCACHE) := fscache.o - netfs-y := \ buffered_read.o \ io.o \ @@ -21,4 +9,16 @@ netfs-y := \ netfs-$(CONFIG_NETFS_STATS) += stats.o +netfs-$(CONFIG_FSCACHE) += \ + fscache_cache.o \ + fscache_cookie.o \ + fscache_io.o \ + fscache_main.o \ + fscache_volume.o + +ifeq ($(CONFIG_PROC_FS),y) +netfs-$(CONFIG_FSCACHE) += fscache_proc.o +endif +netfs-$(CONFIG_FSCACHE_STATS) += fscache_stats.o + obj-$(CONFIG_NETFS_SUPPORT) += netfs.o diff --git a/fs/netfs/fscache_internal.h b/fs/netfs/fscache_internal.h index 1336f517e9b1a..a09b948fcef21 100644 --- a/fs/netfs/fscache_internal.h +++ b/fs/netfs/fscache_internal.h @@ -5,273 +5,10 @@ * Written by David Howells (dhowells@redhat.com) */ +#include "internal.h" + #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) "FS-Cache: " fmt - -#include -#include -#include -#include -#include - -/* - * cache.c - */ -#ifdef CONFIG_PROC_FS -extern const struct seq_operations fscache_caches_seq_ops; -#endif -bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); -void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); -struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache); -void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where); - -static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache) -{ - return smp_load_acquire(&cache->state); -} - -static inline bool fscache_cache_is_live(const struct fscache_cache *cache) -{ - return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE; -} - -static inline void fscache_set_cache_state(struct fscache_cache *cache, - enum fscache_cache_state new_state) -{ - smp_store_release(&cache->state, new_state); - -} - -static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, - enum fscache_cache_state old_state, - enum fscache_cache_state new_state) -{ - return try_cmpxchg_release(&cache->state, &old_state, new_state); -} - -/* - * cookie.c - */ -extern struct kmem_cache *fscache_cookie_jar; -#ifdef CONFIG_PROC_FS -extern const struct seq_operations fscache_cookies_seq_ops; -#endif -extern struct timer_list fscache_cookie_lru_timer; - -extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); -extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie, - enum fscache_access_trace why); - -static inline void fscache_see_cookie(struct fscache_cookie *cookie, - enum fscache_cookie_trace where) -{ - trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref), - where); -} - -/* - * main.c - */ -extern unsigned fscache_debug; - -extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len); - -/* - * proc.c - */ -#ifdef CONFIG_PROC_FS -extern int __init fscache_proc_init(void); -extern void fscache_proc_cleanup(void); -#else -#define fscache_proc_init() (0) -#define fscache_proc_cleanup() do {} while (0) -#endif - -/* - * stats.c - */ -#ifdef CONFIG_FSCACHE_STATS -extern atomic_t fscache_n_volumes; -extern atomic_t fscache_n_volumes_collision; -extern atomic_t fscache_n_volumes_nomem; -extern atomic_t fscache_n_cookies; -extern atomic_t fscache_n_cookies_lru; -extern atomic_t fscache_n_cookies_lru_expired; -extern atomic_t fscache_n_cookies_lru_removed; -extern atomic_t fscache_n_cookies_lru_dropped; - -extern atomic_t fscache_n_acquires; -extern atomic_t fscache_n_acquires_ok; -extern atomic_t fscache_n_acquires_oom; - -extern atomic_t fscache_n_invalidates; - -extern atomic_t fscache_n_relinquishes; -extern atomic_t fscache_n_relinquishes_retire; -extern atomic_t fscache_n_relinquishes_dropped; - -extern atomic_t fscache_n_resizes; -extern atomic_t fscache_n_resizes_null; - -static inline void fscache_stat(atomic_t *stat) -{ - atomic_inc(stat); -} - -static inline void fscache_stat_d(atomic_t *stat) -{ - atomic_dec(stat); -} - -#define __fscache_stat(stat) (stat) - -int fscache_stats_show(struct seq_file *m, void *v); -#else - -#define __fscache_stat(stat) (NULL) -#define fscache_stat(stat) do {} while (0) -#define fscache_stat_d(stat) do {} while (0) -#endif - -/* - * volume.c - */ -#ifdef CONFIG_PROC_FS -extern const struct seq_operations fscache_volumes_seq_ops; -#endif - -struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, - enum fscache_volume_trace where); -void fscache_put_volume(struct fscache_volume *volume, - enum fscache_volume_trace where); -bool fscache_begin_volume_access(struct fscache_volume *volume, - struct fscache_cookie *cookie, - enum fscache_access_trace why); -void fscache_create_volume(struct fscache_volume *volume, bool wait); - - -/*****************************************************************************/ -/* - * debug tracing - */ -#define dbgprintk(FMT, ...) \ - printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) - -#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) -#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) -#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) - -#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__) - -#ifdef __KDEBUG -#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) -#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) -#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) - -#elif defined(CONFIG_FSCACHE_DEBUG) -#define _enter(FMT, ...) \ -do { \ - if (__do_kdebug(ENTER)) \ - kenter(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _leave(FMT, ...) \ -do { \ - if (__do_kdebug(LEAVE)) \ - kleave(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _debug(FMT, ...) \ -do { \ - if (__do_kdebug(DEBUG)) \ - kdebug(FMT, ##__VA_ARGS__); \ -} while (0) - -#else -#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) -#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) -#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) -#endif - -/* - * determine whether a particular optional debugging point should be logged - * - we need to go through three steps to persuade cpp to correctly join the - * shorthand in FSCACHE_DEBUG_LEVEL with its prefix - */ -#define ____do_kdebug(LEVEL, POINT) \ - unlikely((fscache_debug & \ - (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3)))) -#define ___do_kdebug(LEVEL, POINT) \ - ____do_kdebug(LEVEL, POINT) -#define __do_kdebug(POINT) \ - ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT) - -#define FSCACHE_DEBUG_CACHE 0 -#define FSCACHE_DEBUG_COOKIE 1 -#define FSCACHE_DEBUG_OBJECT 2 -#define FSCACHE_DEBUG_OPERATION 3 - -#define FSCACHE_POINT_ENTER 1 -#define FSCACHE_POINT_LEAVE 2 -#define FSCACHE_POINT_DEBUG 4 - -#ifndef FSCACHE_DEBUG_LEVEL -#define FSCACHE_DEBUG_LEVEL CACHE -#endif - -/* - * assertions - */ -#if 1 /* defined(__KDEBUGALL) */ - -#define ASSERT(X) \ -do { \ - if (unlikely(!(X))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - BUG(); \ - } \ -} while (0) - -#define ASSERTCMP(X, OP, Y) \ -do { \ - if (unlikely(!((X) OP (Y)))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - pr_err("%lx " #OP " %lx is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ - BUG(); \ - } \ -} while (0) - -#define ASSERTIF(C, X) \ -do { \ - if (unlikely((C) && !(X))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - BUG(); \ - } \ -} while (0) - -#define ASSERTIFCMP(C, X, OP, Y) \ -do { \ - if (unlikely((C) && !((X) OP (Y)))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - pr_err("%lx " #OP " %lx is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ - BUG(); \ - } \ -} while (0) - -#else - -#define ASSERT(X) do {} while (0) -#define ASSERTCMP(X, OP, Y) do {} while (0) -#define ASSERTIF(C, X) do {} while (0) -#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) - -#endif /* assert or not */ diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c index dad85fd84f6f9..00600a4d9ce55 100644 --- a/fs/netfs/fscache_main.c +++ b/fs/netfs/fscache_main.c @@ -8,18 +8,9 @@ #define FSCACHE_DEBUG_LEVEL CACHE #include #include -#define CREATE_TRACE_POINTS #include "internal.h" - -MODULE_DESCRIPTION("FS Cache Manager"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - -unsigned fscache_debug; -module_param_named(debug, fscache_debug, uint, - S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(fscache_debug, - "FS-Cache debugging mask"); +#define CREATE_TRACE_POINTS +#include EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache); EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume); @@ -92,7 +83,7 @@ static int __init fscache_init(void) goto error_cookie_jar; } - pr_notice("Loaded\n"); + pr_notice("FS-Cache loaded\n"); return 0; error_cookie_jar: @@ -115,7 +106,7 @@ static void __exit fscache_exit(void) kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); destroy_workqueue(fscache_wq); - pr_notice("Unloaded\n"); + pr_notice("FS-Cache unloaded\n"); } module_exit(fscache_exit); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index e96432499eb2a..43769ac606e81 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -9,8 +9,9 @@ #include #include #include +#include #include -#include "fscache_internal.h" +#include #ifdef pr_fmt #undef pr_fmt @@ -106,11 +107,143 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) #endif } +/* + * fscache-cache.c + */ +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_caches_seq_ops; +#endif +bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); +void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); +struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache); +void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where); + +static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache) +{ + return smp_load_acquire(&cache->state); +} + +static inline bool fscache_cache_is_live(const struct fscache_cache *cache) +{ + return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE; +} + +static inline void fscache_set_cache_state(struct fscache_cache *cache, + enum fscache_cache_state new_state) +{ + smp_store_release(&cache->state, new_state); + +} + +static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, + enum fscache_cache_state old_state, + enum fscache_cache_state new_state) +{ + return try_cmpxchg_release(&cache->state, &old_state, new_state); +} + +/* + * fscache-cookie.c + */ +extern struct kmem_cache *fscache_cookie_jar; +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_cookies_seq_ops; +#endif +extern struct timer_list fscache_cookie_lru_timer; + +extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); +extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why); + +static inline void fscache_see_cookie(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) +{ + trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref), + where); +} + +/* + * fscache-main.c + */ +extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len); + +/* + * fscache-proc.c + */ +#ifdef CONFIG_PROC_FS +extern int __init fscache_proc_init(void); +extern void fscache_proc_cleanup(void); +#else +#define fscache_proc_init() (0) +#define fscache_proc_cleanup() do {} while (0) +#endif + +/* + * fscache-stats.c + */ +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_volumes; +extern atomic_t fscache_n_volumes_collision; +extern atomic_t fscache_n_volumes_nomem; +extern atomic_t fscache_n_cookies; +extern atomic_t fscache_n_cookies_lru; +extern atomic_t fscache_n_cookies_lru_expired; +extern atomic_t fscache_n_cookies_lru_removed; +extern atomic_t fscache_n_cookies_lru_dropped; + +extern atomic_t fscache_n_acquires; +extern atomic_t fscache_n_acquires_ok; +extern atomic_t fscache_n_acquires_oom; + +extern atomic_t fscache_n_invalidates; + +extern atomic_t fscache_n_relinquishes; +extern atomic_t fscache_n_relinquishes_retire; +extern atomic_t fscache_n_relinquishes_dropped; + +extern atomic_t fscache_n_resizes; +extern atomic_t fscache_n_resizes_null; + +static inline void fscache_stat(atomic_t *stat) +{ + atomic_inc(stat); +} + +static inline void fscache_stat_d(atomic_t *stat) +{ + atomic_dec(stat); +} + +#define __fscache_stat(stat) (stat) + +int fscache_stats_show(struct seq_file *m, void *v); +#else + +#define __fscache_stat(stat) (NULL) +#define fscache_stat(stat) do {} while (0) +#define fscache_stat_d(stat) do {} while (0) +#endif + +/* + * fscache-volume.c + */ +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_volumes_seq_ops; +#endif + +struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +void fscache_put_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +bool fscache_begin_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why); +void fscache_create_volume(struct fscache_volume *volume, bool wait); + /*****************************************************************************/ /* * debug tracing */ -#if 0 #define dbgprintk(FMT, ...) \ printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) @@ -147,4 +280,57 @@ do { \ #define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) #define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) #endif -#endif + +/* + * assertions + */ +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif /* assert or not */ diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 237c54a01d971..1ba8091fcf3ec 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -8,8 +8,8 @@ #include #include #include "internal.h" -//#define CREATE_TRACE_POINTS -//#include +#define CREATE_TRACE_POINTS +#include MODULE_DESCRIPTION("Network fs support"); MODULE_AUTHOR("Red Hat, Inc."); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 01ac733a63203..f7e32d76e34d7 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -169,8 +169,8 @@ config ROOT_NFS config NFS_FSCACHE bool "Provide NFS client caching support" - depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y - select NETFS_SUPPORT + depends on NFS_FS=m && NETFS_SUPPORT || NFS_FS=y && NETFS_SUPPORT=y + select FSCACHE help Say Y here if you want NFS data to be cached locally on disc through the general filesystem cache manager -- GitLab From 4498a8eccc97de3d65f876b6fdeddb439ef73abc Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 Nov 2023 17:09:47 +0000 Subject: [PATCH 0534/1290] netfs, fscache: Remove ->begin_cache_operation Remove ->begin_cache_operation() in favour of just calling fscache directly. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Christian Brauner cc: linux-fsdevel@vger.kernel.org cc: linux-cachefs@redhat.com --- Documentation/filesystems/netfs_library.rst | 23 +++----------- fs/9p/vfs_addr.c | 16 ---------- fs/afs/file.c | 13 -------- fs/ceph/addr.c | 1 - fs/ceph/cache.h | 12 -------- fs/netfs/buffered_read.c | 33 +++++++++++---------- fs/nfs/fscache.c | 7 ----- include/linux/fscache.h | 3 -- include/linux/netfs.h | 4 +-- 9 files changed, 23 insertions(+), 89 deletions(-) diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index 48b95d04f72d5..4cc657d743f7f 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -295,7 +295,6 @@ through which it can issue requests and negotiate:: struct netfs_request_ops { void (*init_request)(struct netfs_io_request *rreq, struct file *file); void (*free_request)(struct netfs_io_request *rreq); - int (*begin_cache_operation)(struct netfs_io_request *rreq); void (*expand_readahead)(struct netfs_io_request *rreq); bool (*clamp_length)(struct netfs_io_subrequest *subreq); void (*issue_read)(struct netfs_io_subrequest *subreq); @@ -317,20 +316,6 @@ The operations are as follows: [Optional] This is called as the request is being deallocated so that the filesystem can clean up any state it has attached there. - * ``begin_cache_operation()`` - - [Optional] This is called to ask the network filesystem to call into the - cache (if present) to initialise the caching state for this read. The netfs - library module cannot access the cache directly, so the cache should call - something like fscache_begin_read_operation() to do this. - - The cache gets to store its state in ->cache_resources and must set a table - of operations of its own there (though of a different type). - - This should return 0 on success and an error code otherwise. If an error is - reported, the operation may proceed anyway, just without local caching (only - out of memory and interruption errors cause failure here). - * ``expand_readahead()`` [Optional] This is called to allow the filesystem to expand the size of a @@ -460,14 +445,14 @@ When implementing a local cache to be used by the read helpers, two things are required: some way for the network filesystem to initialise the caching for a read request and a table of operations for the helpers to call. -The network filesystem's ->begin_cache_operation() method is called to set up a -cache and this must call into the cache to do the work. If using fscache, for -example, the cache would call:: +To begin a cache operation on an fscache object, the following function is +called:: int fscache_begin_read_operation(struct netfs_io_request *rreq, struct fscache_cookie *cookie); -passing in the request pointer and the cookie corresponding to the file. +passing in the request pointer and the cookie corresponding to the file. This +fills in the cache resources mentioned below. The netfs_io_request object contains a place for the cache to hang its state:: diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 8a635999a7d61..39db7c01e30af 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -82,25 +82,9 @@ static void v9fs_free_request(struct netfs_io_request *rreq) p9_fid_put(fid); } -/** - * v9fs_begin_cache_operation - Begin a cache operation for a read - * @rreq: The read request - */ -static int v9fs_begin_cache_operation(struct netfs_io_request *rreq) -{ -#ifdef CONFIG_9P_FSCACHE - struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode)); - - return fscache_begin_read_operation(&rreq->cache_resources, cookie); -#else - return -ENOBUFS; -#endif -} - const struct netfs_request_ops v9fs_req_ops = { .init_request = v9fs_init_request, .free_request = v9fs_free_request, - .begin_cache_operation = v9fs_begin_cache_operation, .issue_read = v9fs_issue_read, }; diff --git a/fs/afs/file.c b/fs/afs/file.c index d37dd201752ba..8c17e37c2e59f 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -366,18 +366,6 @@ static int afs_init_request(struct netfs_io_request *rreq, struct file *file) return 0; } -static int afs_begin_cache_operation(struct netfs_io_request *rreq) -{ -#ifdef CONFIG_AFS_FSCACHE - struct afs_vnode *vnode = AFS_FS_I(rreq->inode); - - return fscache_begin_read_operation(&rreq->cache_resources, - afs_vnode_cache(vnode)); -#else - return -ENOBUFS; -#endif -} - static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, struct folio **foliop, void **_fsdata) { @@ -394,7 +382,6 @@ static void afs_free_request(struct netfs_io_request *rreq) const struct netfs_request_ops afs_req_ops = { .init_request = afs_init_request, .free_request = afs_free_request, - .begin_cache_operation = afs_begin_cache_operation, .check_write_begin = afs_check_write_begin, .issue_read = afs_issue_read, }; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 85be3bf18cdf3..3b8641febeac0 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -509,7 +509,6 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq) const struct netfs_request_ops ceph_netfs_ops = { .init_request = ceph_init_request, .free_request = ceph_netfs_free_request, - .begin_cache_operation = ceph_begin_cache_operation, .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index dc502daac49ab..b804f10947647 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -57,13 +57,6 @@ static inline int ceph_fscache_dirty_folio(struct address_space *mapping, return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci)); } -static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) -{ - struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); - - return fscache_begin_read_operation(&rreq->cache_resources, cookie); -} - static inline bool ceph_is_cache_enabled(struct inode *inode) { return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode))); @@ -135,11 +128,6 @@ static inline bool ceph_is_cache_enabled(struct inode *inode) return false; } -static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) -{ - return -ENOBUFS; -} - static inline void ceph_fscache_note_page_release(struct inode *inode) { } diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 2cd3ccf4c4399..d39d0ffe75d2d 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -147,6 +147,15 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq, } } +/* + * Begin an operation, and fetch the stored zero point value from the cookie if + * available. + */ +static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx) +{ + return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx)); +} + /** * netfs_readahead - Helper to manage a read request * @ractl: The description of the readahead request @@ -180,11 +189,9 @@ void netfs_readahead(struct readahead_control *ractl) if (IS_ERR(rreq)) return; - if (ctx->ops->begin_cache_operation) { - ret = ctx->ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto cleanup_free; - } + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto cleanup_free; netfs_stat(&netfs_n_rh_readahead); trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), @@ -238,11 +245,9 @@ int netfs_read_folio(struct file *file, struct folio *folio) goto alloc_error; } - if (ctx->ops->begin_cache_operation) { - ret = ctx->ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto discard; - } + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto discard; netfs_stat(&netfs_n_rh_readpage); trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); @@ -390,11 +395,9 @@ retry: rreq->no_unlock_folio = folio_index(folio); __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); - if (ctx->ops->begin_cache_operation) { - ret = ctx->ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto error_put; - } + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto error_put; netfs_stat(&netfs_n_rh_write_begin); trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index b05717fe0d4e4..2d1bfee225c36 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -274,12 +274,6 @@ static void nfs_netfs_free_request(struct netfs_io_request *rreq) put_nfs_open_context(rreq->netfs_priv); } -static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq) -{ - return fscache_begin_read_operation(&rreq->cache_resources, - netfs_i_cookie(netfs_inode(rreq->inode))); -} - static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq) { struct nfs_netfs_io_data *netfs; @@ -387,7 +381,6 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) const struct netfs_request_ops nfs_netfs_ops = { .init_request = nfs_netfs_init_request, .free_request = nfs_netfs_free_request, - .begin_cache_operation = nfs_netfs_begin_cache_operation, .issue_read = nfs_netfs_issue_read, .clamp_length = nfs_netfs_clamp_length }; diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 8e312c8323a8e..9ed6696aee7ab 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -437,9 +437,6 @@ const struct netfs_cache_ops *fscache_operation_valid(const struct netfs_cache_r * indicates the cache resources to which the operation state should be * attached; @cookie indicates the cache object that will be accessed. * - * This is intended to be called from the ->begin_cache_operation() netfs lib - * operation as implemented by the network filesystem. - * * @cres->inval_counter is set from @cookie->inval_counter for comparison at * the end of the operation. This allows invalidation during the operation to * be detected by the caller. diff --git a/include/linux/netfs.h b/include/linux/netfs.h index b11a84f6c32b7..d294ff8f9ae45 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -208,7 +208,6 @@ struct netfs_io_request { struct netfs_request_ops { int (*init_request)(struct netfs_io_request *rreq, struct file *file); void (*free_request)(struct netfs_io_request *rreq); - int (*begin_cache_operation)(struct netfs_io_request *rreq); void (*expand_readahead)(struct netfs_io_request *rreq); bool (*clamp_length)(struct netfs_io_subrequest *subreq); @@ -229,8 +228,7 @@ enum netfs_read_from_hole { }; /* - * Table of operations for access to a cache. This is obtained by - * rreq->ops->begin_cache_operation(). + * Table of operations for access to a cache. */ struct netfs_cache_ops { /* End an operation */ -- GitLab From 7eb5b3e3a0a55f2d166ca949ef47ca6e0c704aab Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 21 Nov 2023 15:43:52 +0000 Subject: [PATCH 0535/1290] netfs, fscache: Move /proc/fs/fscache to /proc/fs/netfs and put in a symlink Rename /proc/fs/fscache to "netfs" and make a symlink from fscache to that. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Christian Brauner cc: linux-fsdevel@vger.kernel.org cc: linux-cachefs@redhat.com --- fs/netfs/fscache_main.c | 8 ++------ fs/netfs/fscache_proc.c | 23 ++++++++--------------- fs/netfs/fscache_stats.c | 4 +--- fs/netfs/internal.h | 12 +++++++++++- fs/netfs/main.c | 33 +++++++++++++++++++++++++++++++++ fs/netfs/stats.c | 13 +++++++------ include/linux/netfs.h | 1 - 7 files changed, 62 insertions(+), 32 deletions(-) diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c index 00600a4d9ce55..42e98bb523e36 100644 --- a/fs/netfs/fscache_main.c +++ b/fs/netfs/fscache_main.c @@ -62,7 +62,7 @@ unsigned int fscache_hash(unsigned int salt, const void *data, size_t len) /* * initialise the fs caching module */ -static int __init fscache_init(void) +int __init fscache_init(void) { int ret = -ENOMEM; @@ -94,12 +94,10 @@ error_wq: return ret; } -fs_initcall(fscache_init); - /* * clean up on module removal */ -static void __exit fscache_exit(void) +void __exit fscache_exit(void) { _enter(""); @@ -108,5 +106,3 @@ static void __exit fscache_exit(void) destroy_workqueue(fscache_wq); pr_notice("FS-Cache unloaded\n"); } - -module_exit(fscache_exit); diff --git a/fs/netfs/fscache_proc.c b/fs/netfs/fscache_proc.c index dc3b0e9c8cce8..ecd0d1edafaa4 100644 --- a/fs/netfs/fscache_proc.c +++ b/fs/netfs/fscache_proc.c @@ -12,41 +12,34 @@ #include "internal.h" /* - * initialise the /proc/fs/fscache/ directory + * Add files to /proc/fs/netfs/. */ int __init fscache_proc_init(void) { - if (!proc_mkdir("fs/fscache", NULL)) - goto error_dir; + if (!proc_symlink("fs/fscache", NULL, "../netfs")) + goto error_sym; - if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL, + if (!proc_create_seq("fs/netfs/caches", S_IFREG | 0444, NULL, &fscache_caches_seq_ops)) goto error; - if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL, + if (!proc_create_seq("fs/netfs/volumes", S_IFREG | 0444, NULL, &fscache_volumes_seq_ops)) goto error; - if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL, + if (!proc_create_seq("fs/netfs/cookies", S_IFREG | 0444, NULL, &fscache_cookies_seq_ops)) goto error; - -#ifdef CONFIG_FSCACHE_STATS - if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL, - fscache_stats_show)) - goto error; -#endif - return 0; error: remove_proc_entry("fs/fscache", NULL); -error_dir: +error_sym: return -ENOMEM; } /* - * clean up the /proc/fs/fscache/ directory + * Clean up the /proc/fs/fscache symlink. */ void fscache_proc_cleanup(void) { diff --git a/fs/netfs/fscache_stats.c b/fs/netfs/fscache_stats.c index fc94e5e79f1c6..aad812ead398f 100644 --- a/fs/netfs/fscache_stats.c +++ b/fs/netfs/fscache_stats.c @@ -52,7 +52,7 @@ EXPORT_SYMBOL(fscache_n_culled); /* * display the general statistics */ -int fscache_stats_show(struct seq_file *m, void *v) +int fscache_stats_show(struct seq_file *m) { seq_puts(m, "FS-Cache statistics\n"); seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n", @@ -96,7 +96,5 @@ int fscache_stats_show(struct seq_file *m, void *v) seq_printf(m, "IO : rd=%u wr=%u\n", atomic_read(&fscache_n_read), atomic_read(&fscache_n_write)); - - netfs_stats_show(m); return 0; } diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 43769ac606e81..3f6e222294339 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -76,6 +76,7 @@ extern atomic_t netfs_n_rh_write_done; extern atomic_t netfs_n_rh_write_failed; extern atomic_t netfs_n_rh_write_zskip; +int netfs_stats_show(struct seq_file *m, void *v); static inline void netfs_stat(atomic_t *stat) { @@ -166,6 +167,13 @@ static inline void fscache_see_cookie(struct fscache_cookie *cookie, * fscache-main.c */ extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len); +#ifdef CONFIG_FSCACHE +int __init fscache_init(void); +void __exit fscache_exit(void); +#else +static inline int fscache_init(void) { return 0; } +static inline void fscache_exit(void) {} +#endif /* * fscache-proc.c @@ -216,12 +224,14 @@ static inline void fscache_stat_d(atomic_t *stat) #define __fscache_stat(stat) (stat) -int fscache_stats_show(struct seq_file *m, void *v); +int fscache_stats_show(struct seq_file *m); #else #define __fscache_stat(stat) (NULL) #define fscache_stat(stat) do {} while (0) #define fscache_stat_d(stat) do {} while (0) + +static inline int fscache_stats_show(struct seq_file *m) { return 0; } #endif /* diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 1ba8091fcf3ec..c9af6e0896d31 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -7,6 +7,8 @@ #include #include +#include +#include #include "internal.h" #define CREATE_TRACE_POINTS #include @@ -19,3 +21,34 @@ unsigned netfs_debug; module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); +static int __init netfs_init(void) +{ + int ret = -ENOMEM; + + if (!proc_mkdir("fs/netfs", NULL)) + goto error; + +#ifdef CONFIG_FSCACHE_STATS + if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL, + netfs_stats_show)) + goto error_proc; +#endif + + ret = fscache_init(); + if (ret < 0) + goto error_proc; + return 0; + +error_proc: + remove_proc_entry("fs/netfs", NULL); +error: + return ret; +} +fs_initcall(netfs_init); + +static void __exit netfs_exit(void) +{ + fscache_exit(); + remove_proc_entry("fs/netfs", NULL); +} +module_exit(netfs_exit); diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 5510a7a14a40d..6025dc485f7ea 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -28,31 +28,32 @@ atomic_t netfs_n_rh_write_done; atomic_t netfs_n_rh_write_failed; atomic_t netfs_n_rh_write_zskip; -void netfs_stats_show(struct seq_file *m) +int netfs_stats_show(struct seq_file *m, void *v) { - seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n", + seq_printf(m, "Netfs : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n", atomic_read(&netfs_n_rh_readahead), atomic_read(&netfs_n_rh_readpage), atomic_read(&netfs_n_rh_write_begin), atomic_read(&netfs_n_rh_write_zskip), atomic_read(&netfs_n_rh_rreq), atomic_read(&netfs_n_rh_sreq)); - seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n", + seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n", atomic_read(&netfs_n_rh_zero), atomic_read(&netfs_n_rh_short_read), atomic_read(&netfs_n_rh_write_zskip)); - seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n", + seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n", atomic_read(&netfs_n_rh_download), atomic_read(&netfs_n_rh_download_done), atomic_read(&netfs_n_rh_download_failed), atomic_read(&netfs_n_rh_download_instead)); - seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n", + seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n", atomic_read(&netfs_n_rh_read), atomic_read(&netfs_n_rh_read_done), atomic_read(&netfs_n_rh_read_failed)); - seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n", + seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n", atomic_read(&netfs_n_rh_write), atomic_read(&netfs_n_rh_write_done), atomic_read(&netfs_n_rh_write_failed)); + return fscache_stats_show(m); } EXPORT_SYMBOL(netfs_stats_show); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index d294ff8f9ae45..9bd91cd615d51 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -294,7 +294,6 @@ void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async, enum netfs_sreq_ref_trace what); -void netfs_stats_show(struct seq_file *); ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, struct iov_iter *new, iov_iter_extraction_t extraction_flags); -- GitLab From c9c4ff12df110feb1b91951010f673f4b16e49e8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Nov 2023 13:58:07 +0000 Subject: [PATCH 0536/1290] netfs: Move pinning-for-writeback from fscache to netfs Move the resource pinning-for-writeback from fscache code to netfslib code. This is used to keep a cache backing object pinned whilst we have dirty pages on the netfs inode in the pagecache such that VM writeback will be able to reach it. Whilst we're at it, switch the parameters of netfs_unpin_writeback() to match ->write_inode() so that it can be used for that directly. Note that this mechanism could be more generically useful than that for network filesystems. Quite often they have to keep around other resources (e.g. authentication tokens or network connections) until the writeback is complete. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/9p/vfs_addr.c | 33 ++++----------- fs/9p/vfs_inode.c | 3 +- fs/9p/vfs_super.c | 14 +------ fs/afs/file.c | 8 +--- fs/afs/inode.c | 2 +- fs/afs/internal.h | 6 --- fs/afs/super.c | 2 +- fs/afs/write.c | 9 ---- fs/ceph/cache.h | 23 ++++------- fs/ceph/inode.c | 2 +- fs/fs-writeback.c | 10 ++--- fs/netfs/Makefile | 1 + fs/netfs/fscache_io.c | 40 ------------------ fs/netfs/misc.c | 86 +++++++++++++++++++++++++++++++++++++++ fs/smb/client/cifsfs.c | 5 +-- fs/smb/client/file.c | 18 +------- include/linux/fs.h | 2 +- include/linux/fscache.h | 42 ------------------- include/linux/netfs.h | 3 ++ include/linux/writeback.h | 2 +- 20 files changed, 124 insertions(+), 187 deletions(-) create mode 100644 fs/netfs/misc.c diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 39db7c01e30af..131b83c31f85d 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -317,30 +317,15 @@ out: return copied; } -#ifdef CONFIG_9P_FSCACHE -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - struct v9fs_inode *v9inode = V9FS_I(mapping->host); - - return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode)); -} -#else -#define v9fs_dirty_folio filemap_dirty_folio -#endif - const struct address_space_operations v9fs_addr_operations = { - .read_folio = netfs_read_folio, - .readahead = netfs_readahead, - .dirty_folio = v9fs_dirty_folio, - .writepage = v9fs_vfs_writepage, - .write_begin = v9fs_write_begin, - .write_end = v9fs_write_end, - .release_folio = v9fs_release_folio, + .read_folio = netfs_read_folio, + .readahead = netfs_readahead, + .dirty_folio = netfs_dirty_folio, + .writepage = v9fs_vfs_writepage, + .write_begin = v9fs_write_begin, + .write_end = v9fs_write_end, + .release_folio = v9fs_release_folio, .invalidate_folio = v9fs_invalidate_folio, - .launder_folio = v9fs_launder_folio, - .direct_IO = v9fs_direct_IO, + .launder_folio = v9fs_launder_folio, + .direct_IO = v9fs_direct_IO, }; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b845ee18a80be..74122540e00f2 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -376,8 +376,7 @@ void v9fs_evict_inode(struct inode *inode) #ifdef CONFIG_9P_FSCACHE version = cpu_to_le32(v9inode->qid.version); - fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode, - &version); + netfs_clear_inode_writeback(inode, &version); #endif clear_inode(inode); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 73db55c050bf1..941f7d0e0bfa2 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -289,31 +289,21 @@ static int v9fs_drop_inode(struct inode *inode) static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct v9fs_inode *v9inode; - /* * send an fsync request to server irrespective of * wbc->sync_mode. */ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); - - v9inode = V9FS_I(inode); - fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); - - return 0; + return netfs_unpin_writeback(inode, wbc); } static int v9fs_write_inode_dotl(struct inode *inode, struct writeback_control *wbc) { - struct v9fs_inode *v9inode; - v9inode = V9FS_I(inode); p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); - fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); - - return 0; + return netfs_unpin_writeback(inode, wbc); } static const struct super_operations v9fs_super_ops = { diff --git a/fs/afs/file.c b/fs/afs/file.c index 8c17e37c2e59f..9142fda7dbd6c 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -55,7 +55,7 @@ const struct inode_operations afs_file_inode_operations = { const struct address_space_operations afs_file_aops = { .read_folio = netfs_read_folio, .readahead = netfs_readahead, - .dirty_folio = afs_dirty_folio, + .dirty_folio = netfs_dirty_folio, .launder_folio = afs_launder_folio, .release_folio = afs_release_folio, .invalidate_folio = afs_invalidate_folio, @@ -386,12 +386,6 @@ const struct netfs_request_ops afs_req_ops = { .issue_read = afs_issue_read, }; -int afs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode))); - return 0; -} - /* * Adjust the dirty region of the page on truncation or full invalidation, * getting rid of the markers altogether if the region is entirely invalidated. diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 78efc97193493..6f375f0cf650d 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -823,7 +823,7 @@ void afs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); afs_set_cache_aux(vnode, &aux); - fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux); + netfs_clear_inode_writeback(inode, &aux); clear_inode(inode); while (!list_empty(&vnode->wb_keys)) { diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 7385d62c8cf50..b77797559e27c 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1073,7 +1073,6 @@ extern int afs_release(struct inode *, struct file *); extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); extern struct afs_read *afs_alloc_read(gfp_t); extern void afs_put_read(struct afs_read *); -extern int afs_write_inode(struct inode *, struct writeback_control *); static inline struct afs_read *afs_get_read(struct afs_read *req) { @@ -1522,11 +1521,6 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ -#ifdef CONFIG_AFS_FSCACHE -bool afs_dirty_folio(struct address_space *, struct folio *); -#else -#define afs_dirty_folio filemap_dirty_folio -#endif extern int afs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata); diff --git a/fs/afs/super.c b/fs/afs/super.c index a01a0fb2cdbb5..4f1b0492f1c5d 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -55,7 +55,7 @@ int afs_net_id; static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, - .write_inode = afs_write_inode, + .write_inode = netfs_unpin_writeback, .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, .free_inode = afs_free_inode, diff --git a/fs/afs/write.c b/fs/afs/write.c index 4a168781936b5..e40cf8e7543ab 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -23,15 +23,6 @@ static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len loff_t i_size, bool caching); #ifdef CONFIG_AFS_FSCACHE -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -bool afs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - return fscache_dirty_folio(mapping, folio, - afs_vnode_cache(AFS_FS_I(mapping->host))); -} static void afs_folio_start_fscache(bool caching, struct folio *folio) { if (caching) diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index b804f10947647..8fc7d828d9903 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -43,19 +43,13 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to) } } -static inline void ceph_fscache_unpin_writeback(struct inode *inode, +static inline int ceph_fscache_unpin_writeback(struct inode *inode, struct writeback_control *wbc) { - fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode))); + return netfs_unpin_writeback(inode, wbc); } -static inline int ceph_fscache_dirty_folio(struct address_space *mapping, - struct folio *folio) -{ - struct ceph_inode_info *ci = ceph_inode(mapping->host); - - return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci)); -} +#define ceph_fscache_dirty_folio netfs_dirty_folio static inline bool ceph_is_cache_enabled(struct inode *inode) { @@ -112,16 +106,13 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to) { } -static inline void ceph_fscache_unpin_writeback(struct inode *inode, - struct writeback_control *wbc) +static inline int ceph_fscache_unpin_writeback(struct inode *inode, + struct writeback_control *wbc) { + return 0; } -static inline int ceph_fscache_dirty_folio(struct address_space *mapping, - struct folio *folio) -{ - return filemap_dirty_folio(mapping, folio); -} +#define ceph_fscache_dirty_folio filemap_dirty_folio static inline bool ceph_is_cache_enabled(struct inode *inode) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0679240f06db9..3149d79a9dbe5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -694,7 +694,7 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_FSCACHE_WB) + if (inode->i_state & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); clear_inode(inode); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1767493dffda7..3d84fcc471c60 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1675,11 +1675,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode->i_state |= I_DIRTY_PAGES; - else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) { + else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) { if (!(inode->i_state & I_DIRTY_PAGES)) { - inode->i_state &= ~I_PINNING_FSCACHE_WB; - wbc->unpinned_fscache_wb = true; - dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */ + inode->i_state &= ~I_PINNING_NETFS_WB; + wbc->unpinned_netfs_wb = true; + dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */ } } @@ -1691,7 +1691,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (ret == 0) ret = err; } - wbc->unpinned_fscache_wb = false; + wbc->unpinned_netfs_wb = false; trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index b57162ef9cfb3..a84fe9bbd3c4e 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -5,6 +5,7 @@ netfs-y := \ io.o \ iterator.o \ main.o \ + misc.o \ objects.o netfs-$(CONFIG_NETFS_STATS) += stats.o diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index 0d2b8dec8f82c..79171a6879304 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -158,46 +158,6 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres, } EXPORT_SYMBOL(__fscache_begin_write_operation); -/** - * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback - * @mapping: The mapping the folio belongs to. - * @folio: The folio being dirtied. - * @cookie: The cookie referring to the cache object - * - * Set the dirty flag on a folio and pin an in-use cache object in memory - * so that writeback can later write to it. This is intended - * to be called from the filesystem's ->dirty_folio() method. - * - * Return: true if the dirty flag was set on the folio, false otherwise. - */ -bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio, - struct fscache_cookie *cookie) -{ - struct inode *inode = mapping->host; - bool need_use = false; - - _enter(""); - - if (!filemap_dirty_folio(mapping, folio)) - return false; - if (!fscache_cookie_valid(cookie)) - return true; - - if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { - spin_lock(&inode->i_lock); - if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { - inode->i_state |= I_PINNING_FSCACHE_WB; - need_use = true; - } - spin_unlock(&inode->i_lock); - - if (need_use) - fscache_use_cookie(cookie, true); - } - return true; -} -EXPORT_SYMBOL(fscache_dirty_folio); - struct fscache_write_request { struct netfs_cache_resources cache_resources; struct address_space *mapping; diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c new file mode 100644 index 0000000000000..68baf55c47a43 --- /dev/null +++ b/fs/netfs/misc.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Miscellaneous routines. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include "internal.h" + +/** + * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback + * @mapping: The mapping the folio belongs to. + * @folio: The folio being dirtied. + * + * Set the dirty flag on a folio and pin an in-use cache object in memory so + * that writeback can later write to it. This is intended to be called from + * the filesystem's ->dirty_folio() method. + * + * Return: true if the dirty flag was set on the folio, false otherwise. + */ +bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + struct inode *inode = mapping->host; + struct netfs_inode *ictx = netfs_inode(inode); + struct fscache_cookie *cookie = netfs_i_cookie(ictx); + bool need_use = false; + + _enter(""); + + if (!filemap_dirty_folio(mapping, folio)) + return false; + if (!fscache_cookie_valid(cookie)) + return true; + + if (!(inode->i_state & I_PINNING_NETFS_WB)) { + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_PINNING_NETFS_WB)) { + inode->i_state |= I_PINNING_NETFS_WB; + need_use = true; + } + spin_unlock(&inode->i_lock); + + if (need_use) + fscache_use_cookie(cookie, true); + } + return true; +} +EXPORT_SYMBOL(netfs_dirty_folio); + +/** + * netfs_unpin_writeback - Unpin writeback resources + * @inode: The inode on which the cookie resides + * @wbc: The writeback control + * + * Unpin the writeback resources pinned by netfs_dirty_folio(). This is + * intended to be called as/by the netfs's ->write_inode() method. + */ +int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc) +{ + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); + + if (wbc->unpinned_netfs_wb) + fscache_unuse_cookie(cookie, NULL, NULL); + return 0; +} +EXPORT_SYMBOL(netfs_unpin_writeback); + +/** + * netfs_clear_inode_writeback - Clear writeback resources pinned by an inode + * @inode: The inode to clean up + * @aux: Auxiliary data to apply to the inode + * + * Clear any writeback resources held by an inode when the inode is evicted. + * This must be called before clear_inode() is called. + */ +void netfs_clear_inode_writeback(struct inode *inode, const void *aux) +{ + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); + + if (inode->i_state & I_PINNING_NETFS_WB) { + loff_t i_size = i_size_read(inode); + fscache_unuse_cookie(cookie, aux, &i_size); + } +} +EXPORT_SYMBOL(netfs_clear_inode_writeback); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 2131638f26d0b..96a65cf9b5ec9 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -429,7 +429,7 @@ static void cifs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_FSCACHE_WB) + if (inode->i_state & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); cifs_fscache_release_inode_cookie(inode); clear_inode(inode); @@ -792,8 +792,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc) { - fscache_unpin_writeback(wbc, cifs_inode_cookie(inode)); - return 0; + return netfs_unpin_writeback(inode, wbc); } static int cifs_drop_inode(struct inode *inode) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 32a8525415d96..b02b7f0a47dca 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -5043,27 +5043,13 @@ static void cifs_swap_deactivate(struct file *file) /* do we need to unpin (or unlock) the file */ } -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -#ifdef CONFIG_CIFS_FSCACHE -static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - return fscache_dirty_folio(mapping, folio, - cifs_inode_cookie(mapping->host)); -} -#else -#define cifs_dirty_folio filemap_dirty_folio -#endif - const struct address_space_operations cifs_addr_ops = { .read_folio = cifs_read_folio, .readahead = cifs_readahead, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .dirty_folio = cifs_dirty_folio, + .dirty_folio = netfs_dirty_folio, .release_folio = cifs_release_folio, .direct_IO = cifs_direct_io, .invalidate_folio = cifs_invalidate_folio, @@ -5087,7 +5073,7 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .dirty_folio = cifs_dirty_folio, + .dirty_folio = netfs_dirty_folio, .release_folio = cifs_release_folio, .invalidate_folio = cifs_invalidate_folio, .launder_folio = cifs_launder_folio, diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e3..68a9572616947 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2294,7 +2294,7 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, #define I_CREATING (1 << 15) #define I_DONTCACHE (1 << 16) #define I_SYNC_QUEUED (1 << 17) -#define I_PINNING_FSCACHE_WB (1 << 18) +#define I_PINNING_NETFS_WB (1 << 18) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 9ed6696aee7ab..6e8562cbcc432 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -626,48 +626,6 @@ static inline void fscache_write_to_cache(struct fscache_cookie *cookie, } -#if __fscache_available -bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio, - struct fscache_cookie *cookie); -#else -#define fscache_dirty_folio(MAPPING, FOLIO, COOKIE) \ - filemap_dirty_folio(MAPPING, FOLIO) -#endif - -/** - * fscache_unpin_writeback - Unpin writeback resources - * @wbc: The writeback control - * @cookie: The cookie referring to the cache object - * - * Unpin the writeback resources pinned by fscache_dirty_folio(). This is - * intended to be called by the netfs's ->write_inode() method. - */ -static inline void fscache_unpin_writeback(struct writeback_control *wbc, - struct fscache_cookie *cookie) -{ - if (wbc->unpinned_fscache_wb) - fscache_unuse_cookie(cookie, NULL, NULL); -} - -/** - * fscache_clear_inode_writeback - Clear writeback resources pinned by an inode - * @cookie: The cookie referring to the cache object - * @inode: The inode to clean up - * @aux: Auxiliary data to apply to the inode - * - * Clear any writeback resources held by an inode when the inode is evicted. - * This must be called before clear_inode() is called. - */ -static inline void fscache_clear_inode_writeback(struct fscache_cookie *cookie, - struct inode *inode, - const void *aux) -{ - if (inode->i_state & I_PINNING_FSCACHE_WB) { - loff_t i_size = i_size_read(inode); - fscache_unuse_cookie(cookie, aux, &i_size); - } -} - /** * fscache_note_page_release - Note that a netfs page got released * @cookie: The cookie corresponding to the file diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 9bd91cd615d51..32faf6c897029 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -288,6 +288,9 @@ int netfs_read_folio(struct file *, struct folio *); int netfs_write_begin(struct netfs_inode *, struct file *, struct address_space *, loff_t pos, unsigned int len, struct folio **, void **fsdata); +bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio); +int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); +void netfs_clear_inode_writeback(struct inode *inode, const void *aux); void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 083387c00f0c8..1e08392fb43e1 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -60,7 +60,7 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ - unsigned unpinned_fscache_wb:1; /* Cleared I_PINNING_FSCACHE_WB */ + unsigned unpinned_netfs_wb:1; /* Cleared I_PINNING_NETFS_WB */ /* * When writeback IOs are bounced through async layers, only the -- GitLab From 87b57a048964abfd5f3d8b79bc55687327f5a380 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 4 Mar 2022 10:34:27 +0000 Subject: [PATCH 0537/1290] netfs: Add a procfile to list in-progress requests Add a procfile, /proc/fs/netfs/requests, to list in-progress netfslib I/O requests. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/internal.h | 22 ++++++++++++++ fs/netfs/main.c | 69 ++++++++++++++++++++++++++++++++++++++++++- fs/netfs/objects.c | 4 ++- include/linux/netfs.h | 6 +++- 4 files changed, 98 insertions(+), 3 deletions(-) diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 3f6e222294339..4708fb15446bd 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -33,6 +33,28 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync); * main.c */ extern unsigned int netfs_debug; +extern struct list_head netfs_io_requests; +extern spinlock_t netfs_proc_lock; + +#ifdef CONFIG_PROC_FS +static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) +{ + spin_lock(&netfs_proc_lock); + list_add_tail_rcu(&rreq->proc_link, &netfs_io_requests); + spin_unlock(&netfs_proc_lock); +} +static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) +{ + if (!list_empty(&rreq->proc_link)) { + spin_lock(&netfs_proc_lock); + list_del_rcu(&rreq->proc_link); + spin_unlock(&netfs_proc_lock); + } +} +#else +static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) {} +static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} +#endif /* * objects.c diff --git a/fs/netfs/main.c b/fs/netfs/main.c index c9af6e0896d31..97ce1436615b7 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -21,13 +21,80 @@ unsigned netfs_debug; module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); +#ifdef CONFIG_PROC_FS +LIST_HEAD(netfs_io_requests); +DEFINE_SPINLOCK(netfs_proc_lock); + +static const char *netfs_origins[] = { + [NETFS_READAHEAD] = "RA", + [NETFS_READPAGE] = "RP", + [NETFS_READ_FOR_WRITE] = "RW", +}; + +/* + * Generate a list of I/O requests in /proc/fs/netfs/requests + */ +static int netfs_requests_seq_show(struct seq_file *m, void *v) +{ + struct netfs_io_request *rreq; + + if (v == &netfs_io_requests) { + seq_puts(m, + "REQUEST OR REF FL ERR OPS COVERAGE\n" + "======== == === == ==== === =========\n" + ); + return 0; + } + + rreq = list_entry(v, struct netfs_io_request, proc_link); + seq_printf(m, + "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx", + rreq->debug_id, + netfs_origins[rreq->origin], + refcount_read(&rreq->ref), + rreq->flags, + rreq->error, + atomic_read(&rreq->nr_outstanding), + rreq->start, rreq->submitted, rreq->len); + seq_putc(m, '\n'); + return 0; +} + +static void *netfs_requests_seq_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) +{ + rcu_read_lock(); + return seq_list_start_head(&netfs_io_requests, *_pos); +} + +static void *netfs_requests_seq_next(struct seq_file *m, void *v, loff_t *_pos) +{ + return seq_list_next(v, &netfs_io_requests, _pos); +} + +static void netfs_requests_seq_stop(struct seq_file *m, void *v) + __releases(rcu) +{ + rcu_read_unlock(); +} + +static const struct seq_operations netfs_requests_seq_ops = { + .start = netfs_requests_seq_start, + .next = netfs_requests_seq_next, + .stop = netfs_requests_seq_stop, + .show = netfs_requests_seq_show, +}; +#endif /* CONFIG_PROC_FS */ + static int __init netfs_init(void) { int ret = -ENOMEM; if (!proc_mkdir("fs/netfs", NULL)) goto error; - + if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL, + &netfs_requests_seq_ops)) + goto error_proc; #ifdef CONFIG_FSCACHE_STATS if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL, netfs_stats_show)) diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index e17cdf53f6a78..85f428fc52e6f 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -45,6 +45,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } } + netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); return rreq; } @@ -76,12 +77,13 @@ static void netfs_free_request(struct work_struct *work) container_of(work, struct netfs_io_request, work); trace_netfs_rreq(rreq, netfs_rreq_trace_free); + netfs_proc_del_rreq(rreq); netfs_clear_subrequests(rreq, false); if (rreq->netfs_ops->free_request) rreq->netfs_ops->free_request(rreq); if (rreq->cache_resources.ops) rreq->cache_resources.ops->end_operation(&rreq->cache_resources); - kfree(rreq); + kfree_rcu(rreq, rcu); netfs_stat_d(&netfs_n_rh_rreq); } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 32faf6c897029..7244ddebd974c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -175,10 +175,14 @@ enum netfs_io_origin { * operations to a variety of data stores and then stitch the result together. */ struct netfs_io_request { - struct work_struct work; + union { + struct work_struct work; + struct rcu_head rcu; + }; struct inode *inode; /* The file being accessed */ struct address_space *mapping; /* The mapping being accessed */ struct netfs_cache_resources cache_resources; + struct list_head proc_link; /* Link in netfs_iorequests */ struct list_head subrequests; /* Contributory I/O operations */ void *netfs_priv; /* Private data for the netfs */ unsigned int debug_id; -- GitLab From cc3cb0a18da46a51d9fc173155576ba1d068e536 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 9 Mar 2022 11:01:12 +0000 Subject: [PATCH 0538/1290] netfs: Allow the netfs to make the io (sub)request alloc larger Allow the network filesystem to specify extra space to be allocated on the end of the io (sub)request. This allows cifs, for example, to use this space rather than allocating its own cifs_readdata struct. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/objects.c | 7 +++++-- include/linux/netfs.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 85f428fc52e6f..c4229c5f3f540 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -22,7 +22,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct netfs_io_request *rreq; int ret; - rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL); + rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request), + GFP_KERNEL); if (!rreq) return ERR_PTR(-ENOMEM); @@ -114,7 +115,9 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq { struct netfs_io_subrequest *subreq; - subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL); + subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?: + sizeof(struct netfs_io_subrequest), + GFP_KERNEL); if (subreq) { INIT_LIST_HEAD(&subreq->rreq_link); refcount_set(&subreq->ref, 2); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 7244ddebd974c..d6f27000eeb07 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -210,6 +210,8 @@ struct netfs_io_request { * Operations the network filesystem can/must provide to the helpers. */ struct netfs_request_ops { + unsigned int io_request_size; /* Alloc size for netfs_io_request struct */ + unsigned int io_subrequest_size; /* Alloc size for netfs_io_subrequest struct */ int (*init_request)(struct netfs_io_request *rreq, struct file *file); void (*free_request)(struct netfs_io_request *rreq); -- GitLab From 5f5ce7ba15e7e6a6539ac8e1f845757aaebecf0d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 25 Feb 2022 11:19:14 +0000 Subject: [PATCH 0539/1290] netfs: Add a ->free_subrequest() op Add a ->free_subrequest() op so that the netfs can clean up data attached to a subrequest. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/objects.c | 2 ++ include/linux/netfs.h | 1 + 2 files changed, 3 insertions(+) diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index c4229c5f3f540..1bd20bdad9836 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -145,6 +145,8 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, struct netfs_io_request *rreq = subreq->rreq; trace_netfs_sreq(subreq, netfs_sreq_trace_free); + if (rreq->netfs_ops->free_subrequest) + rreq->netfs_ops->free_subrequest(subreq); kfree(subreq); netfs_stat_d(&netfs_n_rh_sreq); netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index d6f27000eeb07..06f57d9d09f6c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -214,6 +214,7 @@ struct netfs_request_ops { unsigned int io_subrequest_size; /* Alloc size for netfs_io_subrequest struct */ int (*init_request)(struct netfs_io_request *rreq, struct file *file); void (*free_request)(struct netfs_io_request *rreq); + void (*free_subrequest)(struct netfs_io_subrequest *rreq); void (*expand_readahead)(struct netfs_io_request *rreq); bool (*clamp_length)(struct netfs_io_subrequest *subreq); -- GitLab From a34847d4b73c3a98b565b1d1cc6e1b70c661e18b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 2 Dec 2022 14:12:41 +0000 Subject: [PATCH 0540/1290] afs: Don't use folio->private to record partial modification AFS currently uses folio->private to store the range of bytes within a folio that have been modified - the idea being that if we have, say, a 2MiB folio and someone writes a single byte, we only have to write back that single page and not the whole 2MiB folio - thereby saving on network bandwidth. Remove this, at least for now, and accept the extra network load (which doesn't matter in the common case of writing a whole file at a time from beginning to end). This makes folio->private available for netfslib to use. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/afs/file.c | 67 ------------- fs/afs/internal.h | 56 ----------- fs/afs/write.c | 188 ++++++++----------------------------- include/trace/events/afs.h | 16 +--- 4 files changed, 42 insertions(+), 285 deletions(-) diff --git a/fs/afs/file.c b/fs/afs/file.c index 9142fda7dbd6c..0d783e5b2147b 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -386,63 +386,6 @@ const struct netfs_request_ops afs_req_ops = { .issue_read = afs_issue_read, }; -/* - * Adjust the dirty region of the page on truncation or full invalidation, - * getting rid of the markers altogether if the region is entirely invalidated. - */ -static void afs_invalidate_dirty(struct folio *folio, size_t offset, - size_t length) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - unsigned long priv; - unsigned int f, t, end = offset + length; - - priv = (unsigned long)folio_get_private(folio); - - /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == folio_size(folio)) - goto full_invalidate; - - /* If the page was dirtied by page_mkwrite(), the PTE stays writable - * and we don't get another notification to tell us to expand it - * again. - */ - if (afs_is_folio_dirty_mmapped(priv)) - return; - - /* We may need to shorten the dirty region */ - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - - if (t <= offset || f >= end) - return; /* Doesn't overlap */ - - if (f < offset && t > end) - return; /* Splits the dirty region - just absorb it */ - - if (f >= offset && t <= end) - goto undirty; - - if (f < offset) - t = offset; - else - f = end; - if (f == t) - goto undirty; - - priv = afs_folio_dirty(folio, f, t); - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("trunc"), folio); - return; - -undirty: - trace_afs_folio_dirty(vnode, tracepoint_string("undirty"), folio); - folio_clear_dirty_for_io(folio); -full_invalidate: - trace_afs_folio_dirty(vnode, tracepoint_string("inval"), folio); - folio_detach_private(folio); -} - /* * invalidate part or all of a page * - release a page and clean up its private data if offset is 0 (indicating @@ -453,11 +396,6 @@ static void afs_invalidate_folio(struct folio *folio, size_t offset, { _enter("{%lu},%zu,%zu", folio->index, offset, length); - BUG_ON(!folio_test_locked(folio)); - - if (folio_get_private(folio)) - afs_invalidate_dirty(folio, offset, length); - folio_wait_fscache(folio); _leave(""); } @@ -485,11 +423,6 @@ static bool afs_release_folio(struct folio *folio, gfp_t gfp) fscache_note_page_release(afs_vnode_cache(vnode)); #endif - if (folio_test_private(folio)) { - trace_afs_folio_dirty(vnode, tracepoint_string("rel"), folio); - folio_detach_private(folio); - } - /* Indicate that the folio can be released */ _leave(" = T"); return true; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index b77797559e27c..a7c8d1d702ee0 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -894,62 +894,6 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl i_size_read(&vnode->netfs.inode), flags); } -/* - * We use folio->private to hold the amount of the folio that we've written to, - * splitting the field into two parts. However, we need to represent a range - * 0...FOLIO_SIZE, so we reduce the resolution if the size of the folio - * exceeds what we can encode. - */ -#ifdef CONFIG_64BIT -#define __AFS_FOLIO_PRIV_MASK 0x7fffffffUL -#define __AFS_FOLIO_PRIV_SHIFT 32 -#define __AFS_FOLIO_PRIV_MMAPPED 0x80000000UL -#else -#define __AFS_FOLIO_PRIV_MASK 0x7fffUL -#define __AFS_FOLIO_PRIV_SHIFT 16 -#define __AFS_FOLIO_PRIV_MMAPPED 0x8000UL -#endif - -static inline unsigned int afs_folio_dirty_resolution(struct folio *folio) -{ - int shift = folio_shift(folio) - (__AFS_FOLIO_PRIV_SHIFT - 1); - return (shift > 0) ? shift : 0; -} - -static inline size_t afs_folio_dirty_from(struct folio *folio, unsigned long priv) -{ - unsigned long x = priv & __AFS_FOLIO_PRIV_MASK; - - /* The lower bound is inclusive */ - return x << afs_folio_dirty_resolution(folio); -} - -static inline size_t afs_folio_dirty_to(struct folio *folio, unsigned long priv) -{ - unsigned long x = (priv >> __AFS_FOLIO_PRIV_SHIFT) & __AFS_FOLIO_PRIV_MASK; - - /* The upper bound is immediately beyond the region */ - return (x + 1) << afs_folio_dirty_resolution(folio); -} - -static inline unsigned long afs_folio_dirty(struct folio *folio, size_t from, size_t to) -{ - unsigned int res = afs_folio_dirty_resolution(folio); - from >>= res; - to = (to - 1) >> res; - return (to << __AFS_FOLIO_PRIV_SHIFT) | from; -} - -static inline unsigned long afs_folio_dirty_mmapped(unsigned long priv) -{ - return priv | __AFS_FOLIO_PRIV_MMAPPED; -} - -static inline bool afs_is_folio_dirty_mmapped(unsigned long priv) -{ - return priv & __AFS_FOLIO_PRIV_MMAPPED; -} - #include /*****************************************************************************/ diff --git a/fs/afs/write.c b/fs/afs/write.c index e40cf8e7543ab..80daf28d8f8b8 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -16,7 +16,8 @@ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, - loff_t start, loff_t end, loff_t *_next, + unsigned long long start, + unsigned long long end, loff_t *_next, bool max_one_loop); static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len, @@ -34,25 +35,6 @@ static void afs_folio_start_fscache(bool caching, struct folio *folio) } #endif -/* - * Flush out a conflicting write. This may extend the write to the surrounding - * pages if also dirty and contiguous to the conflicting region.. - */ -static int afs_flush_conflicting_write(struct address_space *mapping, - struct folio *folio) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = folio_pos(folio), - .range_end = LLONG_MAX, - }; - loff_t next; - - return afs_writepages_region(mapping, &wbc, folio_pos(folio), LLONG_MAX, - &next, true); -} - /* * prepare to perform part of a write to a page */ @@ -62,10 +44,6 @@ int afs_write_begin(struct file *file, struct address_space *mapping, { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct folio *folio; - unsigned long priv; - unsigned f, from; - unsigned t, to; - pgoff_t index; int ret; _enter("{%llx:%llu},%llx,%x", @@ -79,49 +57,20 @@ int afs_write_begin(struct file *file, struct address_space *mapping, if (ret < 0) return ret; - index = folio_index(folio); - from = pos - index * PAGE_SIZE; - to = from + len; - try_again: /* See if this page is already partially written in a way that we can * merge the new write with. */ - if (folio_test_private(folio)) { - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - ASSERTCMP(f, <=, t); - - if (folio_test_writeback(folio)) { - trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio); - folio_unlock(folio); - goto wait_for_writeback; - } - /* If the file is being filled locally, allow inter-write - * spaces to be merged into writes. If it's not, only write - * back what the user gives us. - */ - if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) && - (to < f || from > t)) - goto flush_conflicting_write; + if (folio_test_writeback(folio)) { + trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio); + folio_unlock(folio); + goto wait_for_writeback; } *_page = folio_file_page(folio, pos / PAGE_SIZE); _leave(" = 0"); return 0; - /* The previous write and this write aren't adjacent or overlapping, so - * flush the page out. - */ -flush_conflicting_write: - trace_afs_folio_dirty(vnode, tracepoint_string("confl"), folio); - folio_unlock(folio); - - ret = afs_flush_conflicting_write(mapping, folio); - if (ret < 0) - goto error; - wait_for_writeback: ret = folio_wait_writeback_killable(folio); if (ret < 0) @@ -147,9 +96,6 @@ int afs_write_end(struct file *file, struct address_space *mapping, { struct folio *folio = page_folio(subpage); struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - unsigned long priv; - unsigned int f, from = offset_in_folio(folio, pos); - unsigned int t, to = from + copied; loff_t i_size, write_end_pos; _enter("{%llx:%llu},{%lx}", @@ -179,23 +125,6 @@ int afs_write_end(struct file *file, struct address_space *mapping, fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos); } - if (folio_test_private(folio)) { - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - if (from < f) - f = from; - if (to > t) - t = to; - priv = afs_folio_dirty(folio, f, t); - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("dirty+"), folio); - } else { - priv = afs_folio_dirty(folio, from, to); - folio_attach_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("dirty"), folio); - } - if (folio_mark_dirty(folio)) _debug("dirtied %lx", folio_index(folio)); @@ -300,7 +229,6 @@ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsign } trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio); - folio_detach_private(folio); folio_end_writeback(folio); } @@ -454,17 +382,12 @@ static void afs_extend_writeback(struct address_space *mapping, long *_count, loff_t start, loff_t max_len, - bool new_content, bool caching, - unsigned int *_len) + size_t *_len) { struct folio_batch fbatch; struct folio *folio; - unsigned long priv; - unsigned int psize, filler = 0; - unsigned int f, t; - loff_t len = *_len; - pgoff_t index = (start + len) / PAGE_SIZE; + pgoff_t index = (start + *_len) / PAGE_SIZE; bool stop = true; unsigned int i; @@ -492,7 +415,7 @@ static void afs_extend_writeback(struct address_space *mapping, continue; } - /* Has the page moved or been split? */ + /* Has the folio moved or been split? */ if (unlikely(folio != xas_reload(&xas))) { folio_put(folio); break; @@ -510,24 +433,13 @@ static void afs_extend_writeback(struct address_space *mapping, break; } - psize = folio_size(folio); - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - if (f != 0 && !new_content) { - folio_unlock(folio); - folio_put(folio); - break; - } - - len += filler + t; - filler = psize - t; - if (len >= max_len || *_count <= 0) + index += folio_nr_pages(folio); + *_count -= folio_nr_pages(folio); + *_len += folio_size(folio); + stop = false; + if (*_len >= max_len || *_count <= 0) stop = true; - else if (t == psize || new_content) - stop = false; - index += folio_nr_pages(folio); if (!folio_batch_add(&fbatch, folio)) break; if (stop) @@ -553,16 +465,12 @@ static void afs_extend_writeback(struct address_space *mapping, if (folio_start_writeback(folio)) BUG(); afs_folio_start_fscache(caching, folio); - - *_count -= folio_nr_pages(folio); folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); } while (!stop); - - *_len = len; } /* @@ -572,14 +480,13 @@ static void afs_extend_writeback(struct address_space *mapping, static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio, - loff_t start, loff_t end) + unsigned long long start, + unsigned long long end) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct iov_iter iter; - unsigned long priv; - unsigned int offset, to, len, max_len; - loff_t i_size = i_size_read(&vnode->netfs.inode); - bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + unsigned long long i_size = i_size_read(&vnode->netfs.inode); + size_t len, max_len; bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode)); long count = wbc->nr_to_write; int ret; @@ -597,13 +504,9 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, * immediately lockable, is not dirty or is missing, or we reach the * end of the range. */ - priv = (unsigned long)folio_get_private(folio); - offset = afs_folio_dirty_from(folio, priv); - to = afs_folio_dirty_to(folio, priv); trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio); - len = to - offset; - start += offset; + len = folio_size(folio); if (start < i_size) { /* Trim the write to the EOF; the extra data is ignored. Also * put an upper limit on the size of a single storedata op. @@ -612,12 +515,10 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, max_len = min_t(unsigned long long, max_len, end - start + 1); max_len = min_t(unsigned long long, max_len, i_size - start); - if (len < max_len && - (to == folio_size(folio) || new_content)) + if (len < max_len) afs_extend_writeback(mapping, vnode, &count, - start, max_len, new_content, - caching, &len); - len = min_t(loff_t, len, max_len); + start, max_len, caching, &len); + len = min_t(unsigned long long, len, i_size - start); } /* We now have a contiguous set of dirty pages, each with writeback @@ -627,7 +528,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, folio_unlock(folio); if (start < i_size) { - _debug("write back %x @%llx [%llx]", len, start, i_size); + _debug("write back %zx @%llx [%llx]", len, start, i_size); /* Speculatively write to the cache. We have to fix this up * later if the store fails. @@ -637,7 +538,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); ret = afs_store_data(vnode, &iter, start, false); } else { - _debug("write discard %x @%llx [%llx]", len, start, i_size); + _debug("write discard %zx @%llx [%llx]", len, start, i_size); /* The dirty region was entirely beyond the EOF. */ fscache_clear_page_bits(mapping, start, len, caching); @@ -693,7 +594,8 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, */ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, - loff_t start, loff_t end, loff_t *_next, + unsigned long long start, + unsigned long long end, loff_t *_next, bool max_one_loop) { struct folio *folio; @@ -905,7 +807,6 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = file->private_data; - unsigned long priv; vm_fault_t ret = VM_FAULT_RETRY; _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); @@ -929,24 +830,15 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) if (folio_lock_killable(folio) < 0) goto out; - /* We mustn't change folio->private until writeback is complete as that - * details the portion of the page we need to write back and we might - * need to redirty the page if there's a problem. - */ if (folio_wait_writeback_killable(folio) < 0) { folio_unlock(folio); goto out; } - priv = afs_folio_dirty(folio, 0, folio_size(folio)); - priv = afs_folio_dirty_mmapped(priv); - if (folio_test_private(folio)) { - folio_change_private(folio, (void *)priv); + if (folio_test_dirty(folio)) trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio); - } else { - folio_attach_private(folio, (void *)priv); + else trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio); - } file_update_time(file); ret = VM_FAULT_LOCKED; @@ -991,30 +883,26 @@ int afs_launder_folio(struct folio *folio) struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); struct iov_iter iter; struct bio_vec bv; - unsigned long priv; - unsigned int f, t; + unsigned long long fend, i_size = vnode->netfs.inode.i_size; + size_t len; int ret = 0; _enter("{%lx}", folio->index); - priv = (unsigned long)folio_get_private(folio); - if (folio_clear_dirty_for_io(folio)) { - f = 0; - t = folio_size(folio); - if (folio_test_private(folio)) { - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - } + if (folio_clear_dirty_for_io(folio) && folio_pos(folio) < i_size) { + len = folio_size(folio); + fend = folio_pos(folio) + len; + if (vnode->netfs.inode.i_size < fend) + len = fend - i_size; - bvec_set_folio(&bv, folio, t - f, f); - iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, bv.bv_len); + bvec_set_folio(&bv, folio, len, 0); + iov_iter_bvec(&iter, WRITE, &bv, 1, len); trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); - ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true); + ret = afs_store_data(vnode, &iter, folio_pos(folio), true); } trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio); - folio_detach_private(folio); folio_wait_fscache(folio); return ret; } diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 597677acc6b17..08506680350c8 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -846,26 +846,18 @@ TRACE_EVENT(afs_folio_dirty, __field(struct afs_vnode *, vnode) __field(const char *, where) __field(pgoff_t, index) - __field(unsigned long, from) - __field(unsigned long, to) + __field(size_t, size) ), TP_fast_assign( - unsigned long priv = (unsigned long)folio_get_private(folio); __entry->vnode = vnode; __entry->where = where; __entry->index = folio_index(folio); - __entry->from = afs_folio_dirty_from(folio, priv); - __entry->to = afs_folio_dirty_to(folio, priv); - __entry->to |= (afs_is_folio_dirty_mmapped(priv) ? - (1UL << (BITS_PER_LONG - 1)) : 0); + __entry->size = folio_size(folio); ), - TP_printk("vn=%p %lx %s %lx-%lx%s", - __entry->vnode, __entry->index, __entry->where, - __entry->from, - __entry->to & ~(1UL << (BITS_PER_LONG - 1)), - __entry->to & (1UL << (BITS_PER_LONG - 1)) ? " M" : "") + TP_printk("vn=%p ix=%05lx s=%05lx %s", + __entry->vnode, __entry->index, __entry->size, __entry->where) ); TRACE_EVENT(afs_call_state, -- GitLab From c1ec4d7c2e13471558cfea302b7583856284f94c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Aug 2021 17:08:30 +0100 Subject: [PATCH 0541/1290] netfs: Provide invalidate_folio and release_folio calls Provide default invalidate_folio and release_folio calls. These will need to interact with invalidation correctly at some point. They will be needed if netfslib is to make use of folio->private for its own purposes. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/9p/vfs_addr.c | 33 ++------------------------- fs/afs/file.c | 53 ++++--------------------------------------- fs/ceph/addr.c | 24 ++------------------ fs/ceph/cache.h | 10 -------- fs/netfs/misc.c | 42 ++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 6 +++-- 6 files changed, 54 insertions(+), 114 deletions(-) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 131b83c31f85d..055b672a247d7 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -88,35 +88,6 @@ const struct netfs_request_ops v9fs_req_ops = { .issue_read = v9fs_issue_read, }; -/** - * v9fs_release_folio - release the private state associated with a folio - * @folio: The folio to be released - * @gfp: The caller's allocation restrictions - * - * Returns true if the page can be released, false otherwise. - */ - -static bool v9fs_release_folio(struct folio *folio, gfp_t gfp) -{ - if (folio_test_private(folio)) - return false; -#ifdef CONFIG_9P_FSCACHE - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio)))); -#endif - return true; -} - -static void v9fs_invalidate_folio(struct folio *folio, size_t offset, - size_t length) -{ - folio_wait_fscache(folio); -} - #ifdef CONFIG_9P_FSCACHE static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, bool was_async) @@ -324,8 +295,8 @@ const struct address_space_operations v9fs_addr_operations = { .writepage = v9fs_vfs_writepage, .write_begin = v9fs_write_begin, .write_end = v9fs_write_end, - .release_folio = v9fs_release_folio, - .invalidate_folio = v9fs_invalidate_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, .launder_folio = v9fs_launder_folio, .direct_IO = v9fs_direct_IO, }; diff --git a/fs/afs/file.c b/fs/afs/file.c index 0d783e5b2147b..d152ba451f0ec 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -20,9 +20,6 @@ static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); static int afs_symlink_read_folio(struct file *file, struct folio *folio); -static void afs_invalidate_folio(struct folio *folio, size_t offset, - size_t length); -static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, @@ -57,8 +54,8 @@ const struct address_space_operations afs_file_aops = { .readahead = netfs_readahead, .dirty_folio = netfs_dirty_folio, .launder_folio = afs_launder_folio, - .release_folio = afs_release_folio, - .invalidate_folio = afs_invalidate_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, .write_begin = afs_write_begin, .write_end = afs_write_end, .writepages = afs_writepages, @@ -67,8 +64,8 @@ const struct address_space_operations afs_file_aops = { const struct address_space_operations afs_symlink_aops = { .read_folio = afs_symlink_read_folio, - .release_folio = afs_release_folio, - .invalidate_folio = afs_invalidate_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, .migrate_folio = filemap_migrate_folio, }; @@ -386,48 +383,6 @@ const struct netfs_request_ops afs_req_ops = { .issue_read = afs_issue_read, }; -/* - * invalidate part or all of a page - * - release a page and clean up its private data if offset is 0 (indicating - * the entire page) - */ -static void afs_invalidate_folio(struct folio *folio, size_t offset, - size_t length) -{ - _enter("{%lu},%zu,%zu", folio->index, offset, length); - - folio_wait_fscache(folio); - _leave(""); -} - -/* - * release a page and clean up its private state if it's not busy - * - return true if the page can now be released, false if not - */ -static bool afs_release_folio(struct folio *folio, gfp_t gfp) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - - _enter("{{%llx:%llu}[%lu],%lx},%x", - vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags, - gfp); - - /* deny if folio is being written to the cache and the caller hasn't - * elected to wait */ -#ifdef CONFIG_AFS_FSCACHE - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - fscache_note_page_release(afs_vnode_cache(vnode)); -#endif - - /* Indicate that the folio can be released */ - _leave(" = T"); - return true; -} - static void afs_add_open_mmap(struct afs_vnode *vnode) { if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3b8641febeac0..8eedc62e7ac4e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -159,27 +159,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset, ceph_put_snap_context(snapc); } - folio_wait_fscache(folio); -} - -static bool ceph_release_folio(struct folio *folio, gfp_t gfp) -{ - struct inode *inode = folio->mapping->host; - struct ceph_client *cl = ceph_inode_to_client(inode); - - doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode), - folio->index, folio_test_dirty(folio) ? "" : "not "); - - if (folio_test_private(folio)) - return false; - - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - ceph_fscache_note_page_release(inode); - return true; + netfs_invalidate_folio(folio, offset, length); } static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) @@ -1585,7 +1565,7 @@ const struct address_space_operations ceph_aops = { .write_end = ceph_write_end, .dirty_folio = ceph_dirty_folio, .invalidate_folio = ceph_invalidate_folio, - .release_folio = ceph_release_folio, + .release_folio = netfs_release_folio, .direct_IO = noop_direct_IO, }; diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 8fc7d828d9903..20efac020394e 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -56,12 +56,6 @@ static inline bool ceph_is_cache_enabled(struct inode *inode) return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode))); } -static inline void ceph_fscache_note_page_release(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - - fscache_note_page_release(ceph_fscache_cookie(ci)); -} #else /* CONFIG_CEPH_FSCACHE */ static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) @@ -118,10 +112,6 @@ static inline bool ceph_is_cache_enabled(struct inode *inode) { return false; } - -static inline void ceph_fscache_note_page_release(struct inode *inode) -{ -} #endif /* CONFIG_CEPH_FSCACHE */ #endif diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 68baf55c47a43..45bb19ec9a638 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -84,3 +84,45 @@ void netfs_clear_inode_writeback(struct inode *inode, const void *aux) } } EXPORT_SYMBOL(netfs_clear_inode_writeback); + +/** + * netfs_invalidate_folio - Invalidate or partially invalidate a folio + * @folio: Folio proposed for release + * @offset: Offset of the invalidated region + * @length: Length of the invalidated region + * + * Invalidate part or all of a folio for a network filesystem. The folio will + * be removed afterwards if the invalidated region covers the entire folio. + */ +void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) +{ + _enter("{%lx},%zx,%zx", folio_index(folio), offset, length); + + folio_wait_fscache(folio); +} +EXPORT_SYMBOL(netfs_invalidate_folio); + +/** + * netfs_release_folio - Try to release a folio + * @folio: Folio proposed for release + * @gfp: Flags qualifying the release + * + * Request release of a folio and clean up its private state if it's not busy. + * Returns true if the folio can now be released, false if not + */ +bool netfs_release_folio(struct folio *folio, gfp_t gfp) +{ + struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); + + if (folio_test_private(folio)) + return false; + if (folio_test_fscache(folio)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + folio_wait_fscache(folio); + } + + fscache_note_page_release(netfs_i_cookie(ctx)); + return true; +} +EXPORT_SYMBOL(netfs_release_folio); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 06f57d9d09f6c..8efbfd3b28202 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -293,11 +293,13 @@ struct readahead_control; void netfs_readahead(struct readahead_control *); int netfs_read_folio(struct file *, struct folio *); int netfs_write_begin(struct netfs_inode *, struct file *, - struct address_space *, loff_t pos, unsigned int len, - struct folio **, void **fsdata); + struct address_space *, loff_t pos, unsigned int len, + struct folio **, void **fsdata); bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio); int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); void netfs_clear_inode_writeback(struct inode *inode, const void *aux); +void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length); +bool netfs_release_folio(struct folio *folio, gfp_t gfp); void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, -- GitLab From 46ed60dcd4f2c94d27735743ce55cd8d6b93cc1d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 11 Oct 2023 15:34:07 +0100 Subject: [PATCH 0542/1290] netfs: Implement unbuffered/DIO vs buffered I/O locking Borrow NFS's direct-vs-buffered I/O locking into netfslib. Similar code is also used in ceph. Modify it to have the correct checker annotations for i_rwsem lock acquisition/release and to return -ERESTARTSYS if waits are interrupted. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/Makefile | 1 + fs/netfs/locking.c | 216 ++++++++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 10 ++ 3 files changed, 227 insertions(+) create mode 100644 fs/netfs/locking.c diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index a84fe9bbd3c4e..cf3fc847b8ace 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -4,6 +4,7 @@ netfs-y := \ buffered_read.o \ io.o \ iterator.o \ + locking.o \ main.o \ misc.o \ objects.o diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c new file mode 100644 index 0000000000000..75dc52a49b3a4 --- /dev/null +++ b/fs/netfs/locking.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * I/O and data path helper functionality. + * + * Borrowed from NFS Copyright (c) 2016 Trond Myklebust + */ + +#include +#include +#include "internal.h" + +/* + * inode_dio_wait_interruptible - wait for outstanding DIO requests to finish + * @inode: inode to wait for + * + * Waits for all pending direct I/O requests to finish so that we can + * proceed with a truncate or equivalent operation. + * + * Must be called under a lock that serializes taking new references + * to i_dio_count, usually by inode->i_mutex. + */ +static int inode_dio_wait_interruptible(struct inode *inode) +{ + if (!atomic_read(&inode->i_dio_count)) + return 0; + + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); + DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); + + for (;;) { + prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE); + if (!atomic_read(&inode->i_dio_count)) + break; + if (signal_pending(current)) + break; + schedule(); + } + finish_wait(wq, &q.wq_entry); + + return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0; +} + +/* Call with exclusively locked inode->i_rwsem */ +static int netfs_block_o_direct(struct netfs_inode *ictx) +{ + if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) + return 0; + clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags); + return inode_dio_wait_interruptible(&ictx->inode); +} + +/** + * netfs_start_io_read - declare the file is being used for buffered reads + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * NETFS_ICTX_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +int netfs_start_io_read(struct inode *inode) + __acquires(inode->i_rwsem) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + /* Be an optimist! */ + if (down_read_interruptible(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) == 0) + return 0; + up_read(&inode->i_rwsem); + + /* Slow path.... */ + if (down_write_killable(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (netfs_block_o_direct(ictx) < 0) { + up_write(&inode->i_rwsem); + return -ERESTARTSYS; + } + downgrade_write(&inode->i_rwsem); + return 0; +} +EXPORT_SYMBOL(netfs_start_io_read); + +/** + * netfs_end_io_read - declare that the buffered read operation is done + * @inode: file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void netfs_end_io_read(struct inode *inode) + __releases(inode->i_rwsem) +{ + up_read(&inode->i_rwsem); +} +EXPORT_SYMBOL(netfs_end_io_read); + +/** + * netfs_start_io_write - declare the file is being used for buffered writes + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + */ +int netfs_start_io_write(struct inode *inode) + __acquires(inode->i_rwsem) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + if (down_write_killable(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (netfs_block_o_direct(ictx) < 0) { + up_write(&inode->i_rwsem); + return -ERESTARTSYS; + } + return 0; +} +EXPORT_SYMBOL(netfs_start_io_write); + +/** + * netfs_end_io_write - declare that the buffered write operation is done + * @inode: file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void netfs_end_io_write(struct inode *inode) + __releases(inode->i_rwsem) +{ + up_write(&inode->i_rwsem); +} +EXPORT_SYMBOL(netfs_end_io_write); + +/* Call with exclusively locked inode->i_rwsem */ +static int netfs_block_buffered(struct inode *inode) +{ + struct netfs_inode *ictx = netfs_inode(inode); + int ret; + + if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) { + set_bit(NETFS_ICTX_ODIRECT, &ictx->flags); + if (inode->i_mapping->nrpages != 0) { + unmap_mapping_range(inode->i_mapping, 0, 0, 0); + ret = filemap_fdatawait(inode->i_mapping); + if (ret < 0) { + clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags); + return ret; + } + } + } + return 0; +} + +/** + * netfs_start_io_direct - declare the file is being used for direct i/o + * @inode: file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * NETFS_ICTX_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +int netfs_start_io_direct(struct inode *inode) + __acquires(inode->i_rwsem) +{ + struct netfs_inode *ictx = netfs_inode(inode); + int ret; + + /* Be an optimist! */ + if (down_read_interruptible(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) != 0) + return 0; + up_read(&inode->i_rwsem); + + /* Slow path.... */ + if (down_write_killable(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + ret = netfs_block_buffered(inode); + if (ret < 0) { + up_write(&inode->i_rwsem); + return ret; + } + downgrade_write(&inode->i_rwsem); + return 0; +} +EXPORT_SYMBOL(netfs_start_io_direct); + +/** + * netfs_end_io_direct - declare that the direct i/o operation is done + * @inode: file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void netfs_end_io_direct(struct inode *inode) + __releases(inode->i_rwsem) +{ + up_read(&inode->i_rwsem); +} +EXPORT_SYMBOL(netfs_end_io_direct); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 8efbfd3b28202..fc6d9756a0294 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -129,6 +129,8 @@ struct netfs_inode { struct fscache_cookie *cache; #endif loff_t remote_i_size; /* Size of the remote file */ + unsigned long flags; +#define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ }; /* @@ -310,6 +312,13 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, struct iov_iter *new, iov_iter_extraction_t extraction_flags); +int netfs_start_io_read(struct inode *inode); +void netfs_end_io_read(struct inode *inode); +int netfs_start_io_write(struct inode *inode); +void netfs_end_io_write(struct inode *inode); +int netfs_start_io_direct(struct inode *inode); +void netfs_end_io_direct(struct inode *inode); + /** * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query @@ -335,6 +344,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, { ctx->ops = ops; ctx->remote_i_size = i_size_read(&ctx->inode); + ctx->flags = 0; #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif -- GitLab From 92b6cc5d1e7cbe569f00e9c1249ac8214fd5e2d2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 26 Sep 2023 17:42:26 +0100 Subject: [PATCH 0543/1290] netfs: Add iov_iters to (sub)requests to describe various buffers Add three iov_iter structs: (1) Add an iov_iter (->iter) to the I/O request to describe the unencrypted-side buffer. (2) Add an iov_iter (->io_iter) to the I/O request to describe the encrypted-side I/O buffer. This may be a different size to the buffer in (1). (3) Add an iov_iter (->io_iter) to the I/O subrequest to describe the part of the I/O buffer for that subrequest. This will allow future patches to point to a bounce buffer instead for purposes of handling oversize writes, decryption (where we want to save the encrypted data to the cache) and decompression. These iov_iters persist for the lifetime of the (sub)request, and so can be accessed multiple times without worrying about them being deallocated upon return to the caller. The network filesystem must appropriately advance the iterator before terminating the request. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/afs/file.c | 6 +--- fs/netfs/buffered_read.c | 13 ++++++++ fs/netfs/io.c | 69 +++++++++++++++++++++++++++++----------- include/linux/netfs.h | 3 ++ 4 files changed, 67 insertions(+), 24 deletions(-) diff --git a/fs/afs/file.c b/fs/afs/file.c index d152ba451f0ec..3403bb792deb8 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -323,11 +323,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->len = subreq->len - subreq->transferred; fsreq->key = key_get(subreq->rreq->netfs_priv); fsreq->vnode = vnode; - fsreq->iter = &fsreq->def_iter; - - iov_iter_xarray(&fsreq->def_iter, ITER_DEST, - &fsreq->vnode->netfs.inode.i_mapping->i_pages, - fsreq->pos, fsreq->len); + fsreq->iter = &subreq->io_iter; afs_fetch_data(fsreq->vnode, fsreq); afs_put_read(fsreq); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index d39d0ffe75d2d..751556faa70b1 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -199,6 +199,10 @@ void netfs_readahead(struct readahead_control *ractl) netfs_rreq_expand(rreq, ractl); + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages, + rreq->start, rreq->len); + /* Drop the refs on the folios here rather than in the cache or * filesystem. The locks will be dropped in netfs_rreq_unlock(). */ @@ -251,6 +255,11 @@ int netfs_read_folio(struct file *file, struct folio *folio) netfs_stat(&netfs_n_rh_readpage); trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); + + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + return netfs_begin_read(rreq, true); discard: @@ -408,6 +417,10 @@ retry: ractl._nr_pages = folio_nr_pages(folio); netfs_rreq_expand(rreq, &ractl); + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + /* We hold the folio locks, so we can drop the references */ folio_get(folio); while (readahead_folio(&ractl)) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 7f753380e047a..e9d408e211b83 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -21,12 +21,7 @@ */ static void netfs_clear_unread(struct netfs_io_subrequest *subreq) { - struct iov_iter iter; - - iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - iov_iter_zero(iov_iter_count(&iter), &iter); + iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter); } static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, @@ -46,14 +41,9 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq, enum netfs_read_from_hole read_hole) { struct netfs_cache_resources *cres = &rreq->cache_resources; - struct iov_iter iter; netfs_stat(&netfs_n_rh_read); - iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - - cres->ops->read(cres, subreq->start, &iter, read_hole, + cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole, netfs_cache_read_terminated, subreq); } @@ -88,6 +78,11 @@ static void netfs_read_from_server(struct netfs_io_request *rreq, struct netfs_io_subrequest *subreq) { netfs_stat(&netfs_n_rh_download); + if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) + pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n", + rreq->debug_id, subreq->debug_index, + iov_iter_count(&subreq->io_iter), subreq->len, + subreq->transferred, subreq->flags); rreq->netfs_ops->issue_read(subreq); } @@ -259,6 +254,30 @@ static void netfs_rreq_short_read(struct netfs_io_request *rreq, netfs_read_from_server(rreq, subreq); } +/* + * Reset the subrequest iterator prior to resubmission. + */ +static void netfs_reset_subreq_iter(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + size_t remaining = subreq->len - subreq->transferred; + size_t count = iov_iter_count(&subreq->io_iter); + + if (count == remaining) + return; + + _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", + rreq->debug_id, subreq->debug_index, + iov_iter_count(&subreq->io_iter), subreq->transferred, + subreq->len, rreq->i_size, + subreq->io_iter.iter_type); + + if (count < remaining) + iov_iter_revert(&subreq->io_iter, remaining - count); + else + iov_iter_advance(&subreq->io_iter, count - remaining); +} + /* * Resubmit any short or failed operations. Returns true if we got the rreq * ref back. @@ -287,6 +306,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); atomic_inc(&rreq->nr_outstanding); + netfs_reset_subreq_iter(rreq, subreq); netfs_read_from_server(rreq, subreq); } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { netfs_rreq_short_read(rreq, subreq); @@ -399,9 +419,9 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, struct netfs_io_request *rreq = subreq->rreq; int u; - _enter("[%u]{%llx,%lx},%zd", - subreq->debug_index, subreq->start, subreq->flags, - transferred_or_error); + _enter("R=%x[%x]{%llx,%lx},%zd", + rreq->debug_id, subreq->debug_index, + subreq->start, subreq->flags, transferred_or_error); switch (subreq->source) { case NETFS_READ_FROM_CACHE: @@ -501,7 +521,8 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest */ static enum netfs_io_source netfs_rreq_prepare_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) + struct netfs_io_subrequest *subreq, + struct iov_iter *io_iter) { enum netfs_io_source source; @@ -528,9 +549,14 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, } } - if (WARN_ON(subreq->len == 0)) + if (WARN_ON(subreq->len == 0)) { source = NETFS_INVALID_READ; + goto out; + } + subreq->io_iter = *io_iter; + iov_iter_truncate(&subreq->io_iter, subreq->len); + iov_iter_advance(io_iter, subreq->len); out: subreq->source = source; trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); @@ -541,6 +567,7 @@ out: * Slice off a piece of a read request and submit an I/O request for it. */ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, + struct iov_iter *io_iter, unsigned int *_debug_index) { struct netfs_io_subrequest *subreq; @@ -565,7 +592,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, * (the starts must coincide), in which case, we go around the loop * again and ask it to download the next piece. */ - source = netfs_rreq_prepare_read(rreq, subreq); + source = netfs_rreq_prepare_read(rreq, subreq, io_iter); if (source == NETFS_INVALID_READ) goto subreq_failed; @@ -603,6 +630,7 @@ subreq_failed: */ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) { + struct iov_iter io_iter; unsigned int debug_index = 0; int ret; @@ -615,6 +643,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) return -EIO; } + rreq->io_iter = rreq->iter; + INIT_WORK(&rreq->work, netfs_rreq_work); if (sync) @@ -624,8 +654,9 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) * want and submit each one. */ atomic_set(&rreq->nr_outstanding, 1); + io_iter = rreq->io_iter; do { - if (!netfs_rreq_submit_slice(rreq, &debug_index)) + if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index)) break; } while (rreq->submitted < rreq->len); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index fc6d9756a0294..3da962e977f55 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -150,6 +150,7 @@ struct netfs_cache_resources { struct netfs_io_subrequest { struct netfs_io_request *rreq; /* Supervising I/O request */ struct list_head rreq_link; /* Link in rreq->subrequests */ + struct iov_iter io_iter; /* Iterator for this subrequest */ loff_t start; /* Where to start the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ @@ -186,6 +187,8 @@ struct netfs_io_request { struct netfs_cache_resources cache_resources; struct list_head proc_link; /* Link in netfs_iorequests */ struct list_head subrequests; /* Contributory I/O operations */ + struct iov_iter iter; /* Unencrypted-side iterator */ + struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ void *netfs_priv; /* Private data for the netfs */ unsigned int debug_id; atomic_t nr_outstanding; /* Number of ops in progress */ -- GitLab From 7984d22f1315bf30433e11e5010e4ce09ca22037 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Fri, 22 Dec 2023 16:47:40 -0800 Subject: [PATCH 0544/1290] cxl/region: Add dev_dbg() detail on failure to allocate HPA space When the region driver fails while allocating HPA space for a new region it can be because the parent resource, the CXL Window, has no more available space. In that case, the debug user sees this message: cxl_core:alloc_hpa:555: cxl region2: failed to allocate HPA: -34 Expand the message like this: cxl_core:alloc_hpa:555: cxl region8: HPA allocation error (-34) for size:0x20000000 in CXL Window 0 [mem 0xf010000000-0xf04fffffff flags 0x200] Now the debug user can examine /proc/iomem and consider actions like removing other allocations in that space or reducing the size of their region request. Suggested-by: Dan Williams Signed-off-by: Alison Schofield Reviewed-by: Dave Jiang Reviewed-by: Vishal Verma Reviewed-by: Fan Ni Link: https://lore.kernel.org/r/20231223004740.1401858-1-alison.schofield@intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/region.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index a61703c9d72ab..d904f9752bc38 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -552,8 +552,9 @@ static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size) res = alloc_free_mem_region(cxlrd->res, size, SZ_256M, dev_name(&cxlr->dev)); if (IS_ERR(res)) { - dev_dbg(&cxlr->dev, "failed to allocate HPA: %ld\n", - PTR_ERR(res)); + dev_dbg(&cxlr->dev, + "HPA allocation error (%ld) for size:%#llx in %s %pr\n", + PTR_ERR(res), size, cxlrd->res->name, cxlrd->res); return PTR_ERR(res); } -- GitLab From c04c4ebd452476af1bdfa917105d9da97f01868d Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Fri, 17 Nov 2023 10:10:04 -0600 Subject: [PATCH 0545/1290] power: reset: gpio-restart: Use devm_register_sys_off_handler() Use device life-cycle managed register function to simplify probe error path and eliminate need for explicit remove function. Signed-off-by: Andrew Davis Signed-off-by: Sebastian Reichel --- drivers/power/reset/gpio-restart.c | 34 ++++++++---------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/drivers/power/reset/gpio-restart.c b/drivers/power/reset/gpio-restart.c index 3aa19765772dc..d1e177176fa1f 100644 --- a/drivers/power/reset/gpio-restart.c +++ b/drivers/power/reset/gpio-restart.c @@ -17,17 +17,14 @@ struct gpio_restart { struct gpio_desc *reset_gpio; - struct notifier_block restart_handler; u32 active_delay_ms; u32 inactive_delay_ms; u32 wait_delay_ms; }; -static int gpio_restart_notify(struct notifier_block *this, - unsigned long mode, void *cmd) +static int gpio_restart_notify(struct sys_off_data *data) { - struct gpio_restart *gpio_restart = - container_of(this, struct gpio_restart, restart_handler); + struct gpio_restart *gpio_restart = data->cb_data; /* drive it active, also inactive->active edge */ gpiod_direction_output(gpio_restart->reset_gpio, 1); @@ -52,6 +49,7 @@ static int gpio_restart_probe(struct platform_device *pdev) { struct gpio_restart *gpio_restart; bool open_source = false; + int priority = 129; u32 property; int ret; @@ -71,8 +69,6 @@ static int gpio_restart_probe(struct platform_device *pdev) return ret; } - gpio_restart->restart_handler.notifier_call = gpio_restart_notify; - gpio_restart->restart_handler.priority = 129; gpio_restart->active_delay_ms = 100; gpio_restart->inactive_delay_ms = 100; gpio_restart->wait_delay_ms = 3000; @@ -83,7 +79,7 @@ static int gpio_restart_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Invalid priority property: %u\n", property); else - gpio_restart->restart_handler.priority = property; + priority = property; } of_property_read_u32(pdev->dev.of_node, "active-delay", @@ -93,9 +89,11 @@ static int gpio_restart_probe(struct platform_device *pdev) of_property_read_u32(pdev->dev.of_node, "wait-delay", &gpio_restart->wait_delay_ms); - platform_set_drvdata(pdev, gpio_restart); - - ret = register_restart_handler(&gpio_restart->restart_handler); + ret = devm_register_sys_off_handler(&pdev->dev, + SYS_OFF_MODE_RESTART, + priority, + gpio_restart_notify, + gpio_restart); if (ret) { dev_err(&pdev->dev, "%s: cannot register restart handler, %d\n", __func__, ret); @@ -105,19 +103,6 @@ static int gpio_restart_probe(struct platform_device *pdev) return 0; } -static void gpio_restart_remove(struct platform_device *pdev) -{ - struct gpio_restart *gpio_restart = platform_get_drvdata(pdev); - int ret; - - ret = unregister_restart_handler(&gpio_restart->restart_handler); - if (ret) { - dev_err(&pdev->dev, - "%s: cannot unregister restart handler, %d\n", - __func__, ret); - } -} - static const struct of_device_id of_gpio_restart_match[] = { { .compatible = "gpio-restart", }, {}, @@ -125,7 +110,6 @@ static const struct of_device_id of_gpio_restart_match[] = { static struct platform_driver gpio_restart_driver = { .probe = gpio_restart_probe, - .remove_new = gpio_restart_remove, .driver = { .name = "restart-gpio", .of_match_table = of_gpio_restart_match, -- GitLab From c73cc447751898aeac5ef1c0012a03d45721949a Mon Sep 17 00:00:00 2001 From: Charalampos Mitrodimas Date: Sat, 18 Nov 2023 13:29:57 +0000 Subject: [PATCH 0546/1290] power: supply: Fix indentation and some other warnings These were mentioned by checkpatch: Errors: (1) code indent should use tabs where possible (2) switch and case should be at the same indent Warnings: (1) Missing a blank line after declarations Signed-off-by: Charalampos Mitrodimas Signed-off-by: Sebastian Reichel --- drivers/power/supply/power_supply_core.c | 161 ++++++++++++----------- 1 file changed, 81 insertions(+), 80 deletions(-) diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c index 73265001dd4b2..4a5b570dff449 100644 --- a/drivers/power/supply/power_supply_core.c +++ b/drivers/power/supply/power_supply_core.c @@ -861,44 +861,44 @@ const size_t power_supply_battery_info_properties_size = ARRAY_SIZE(power_supply EXPORT_SYMBOL_GPL(power_supply_battery_info_properties_size); bool power_supply_battery_info_has_prop(struct power_supply_battery_info *info, - enum power_supply_property psp) + enum power_supply_property psp) { if (!info) return false; switch (psp) { - case POWER_SUPPLY_PROP_TECHNOLOGY: - return info->technology != POWER_SUPPLY_TECHNOLOGY_UNKNOWN; - case POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN: - return info->energy_full_design_uwh >= 0; - case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN: - return info->charge_full_design_uah >= 0; - case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN: - return info->voltage_min_design_uv >= 0; - case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN: - return info->voltage_max_design_uv >= 0; - case POWER_SUPPLY_PROP_PRECHARGE_CURRENT: - return info->precharge_current_ua >= 0; - case POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT: - return info->charge_term_current_ua >= 0; - case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: - return info->constant_charge_current_max_ua >= 0; - case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: - return info->constant_charge_voltage_max_uv >= 0; - case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN: - return info->temp_ambient_alert_min > INT_MIN; - case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX: - return info->temp_ambient_alert_max < INT_MAX; - case POWER_SUPPLY_PROP_TEMP_ALERT_MIN: - return info->temp_alert_min > INT_MIN; - case POWER_SUPPLY_PROP_TEMP_ALERT_MAX: - return info->temp_alert_max < INT_MAX; - case POWER_SUPPLY_PROP_TEMP_MIN: - return info->temp_min > INT_MIN; - case POWER_SUPPLY_PROP_TEMP_MAX: - return info->temp_max < INT_MAX; - default: - return false; + case POWER_SUPPLY_PROP_TECHNOLOGY: + return info->technology != POWER_SUPPLY_TECHNOLOGY_UNKNOWN; + case POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN: + return info->energy_full_design_uwh >= 0; + case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN: + return info->charge_full_design_uah >= 0; + case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN: + return info->voltage_min_design_uv >= 0; + case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN: + return info->voltage_max_design_uv >= 0; + case POWER_SUPPLY_PROP_PRECHARGE_CURRENT: + return info->precharge_current_ua >= 0; + case POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT: + return info->charge_term_current_ua >= 0; + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: + return info->constant_charge_current_max_ua >= 0; + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: + return info->constant_charge_voltage_max_uv >= 0; + case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN: + return info->temp_ambient_alert_min > INT_MIN; + case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX: + return info->temp_ambient_alert_max < INT_MAX; + case POWER_SUPPLY_PROP_TEMP_ALERT_MIN: + return info->temp_alert_min > INT_MIN; + case POWER_SUPPLY_PROP_TEMP_ALERT_MAX: + return info->temp_alert_max < INT_MAX; + case POWER_SUPPLY_PROP_TEMP_MIN: + return info->temp_min > INT_MIN; + case POWER_SUPPLY_PROP_TEMP_MAX: + return info->temp_max < INT_MAX; + default: + return false; } } EXPORT_SYMBOL_GPL(power_supply_battery_info_has_prop); @@ -914,53 +914,53 @@ int power_supply_battery_info_get_prop(struct power_supply_battery_info *info, return -EINVAL; switch (psp) { - case POWER_SUPPLY_PROP_TECHNOLOGY: - val->intval = info->technology; - return 0; - case POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN: - val->intval = info->energy_full_design_uwh; - return 0; - case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN: - val->intval = info->charge_full_design_uah; - return 0; - case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN: - val->intval = info->voltage_min_design_uv; - return 0; - case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN: - val->intval = info->voltage_max_design_uv; - return 0; - case POWER_SUPPLY_PROP_PRECHARGE_CURRENT: - val->intval = info->precharge_current_ua; - return 0; - case POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT: - val->intval = info->charge_term_current_ua; - return 0; - case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: - val->intval = info->constant_charge_current_max_ua; - return 0; - case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: - val->intval = info->constant_charge_voltage_max_uv; - return 0; - case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN: - val->intval = info->temp_ambient_alert_min; - return 0; - case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX: - val->intval = info->temp_ambient_alert_max; - return 0; - case POWER_SUPPLY_PROP_TEMP_ALERT_MIN: - val->intval = info->temp_alert_min; - return 0; - case POWER_SUPPLY_PROP_TEMP_ALERT_MAX: - val->intval = info->temp_alert_max; - return 0; - case POWER_SUPPLY_PROP_TEMP_MIN: - val->intval = info->temp_min; - return 0; - case POWER_SUPPLY_PROP_TEMP_MAX: - val->intval = info->temp_max; - return 0; - default: - return -EINVAL; + case POWER_SUPPLY_PROP_TECHNOLOGY: + val->intval = info->technology; + return 0; + case POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN: + val->intval = info->energy_full_design_uwh; + return 0; + case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN: + val->intval = info->charge_full_design_uah; + return 0; + case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN: + val->intval = info->voltage_min_design_uv; + return 0; + case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN: + val->intval = info->voltage_max_design_uv; + return 0; + case POWER_SUPPLY_PROP_PRECHARGE_CURRENT: + val->intval = info->precharge_current_ua; + return 0; + case POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT: + val->intval = info->charge_term_current_ua; + return 0; + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX: + val->intval = info->constant_charge_current_max_ua; + return 0; + case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX: + val->intval = info->constant_charge_voltage_max_uv; + return 0; + case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN: + val->intval = info->temp_ambient_alert_min; + return 0; + case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX: + val->intval = info->temp_ambient_alert_max; + return 0; + case POWER_SUPPLY_PROP_TEMP_ALERT_MIN: + val->intval = info->temp_alert_min; + return 0; + case POWER_SUPPLY_PROP_TEMP_ALERT_MAX: + val->intval = info->temp_alert_max; + return 0; + case POWER_SUPPLY_PROP_TEMP_MIN: + val->intval = info->temp_min; + return 0; + case POWER_SUPPLY_PROP_TEMP_MAX: + val->intval = info->temp_max; + return 0; + default: + return -EINVAL; } } EXPORT_SYMBOL_GPL(power_supply_battery_info_get_prop); @@ -1255,6 +1255,7 @@ EXPORT_SYMBOL_GPL(power_supply_powers); static void power_supply_dev_release(struct device *dev) { struct power_supply *psy = to_power_supply(dev); + dev_dbg(dev, "%s\n", __func__); kfree(psy); } -- GitLab From 3cbbe1be0e3bd4cac5502eb141dd257ab34a4460 Mon Sep 17 00:00:00 2001 From: Charalampos Mitrodimas Date: Sat, 18 Nov 2023 13:29:58 +0000 Subject: [PATCH 0547/1290] power: supply: Use multiple MODULE_AUTHOR statements This resolves checkpatch warning "quoted string split across lines" on: 1640: WARNING: quoted string split across lines 1641: WARNING: quoted string split across lines The motive to use multiple MODULE_AUTHOR statements came from this comment from "include/linux/module.h": /* * Author(s), use "Name " or just "Name", for multiple * authors use multiple MODULE_AUTHOR() statements/lines. */ #define MODULE_AUTHOR(_author) MODULE_INFO(author, _author) Signed-off-by: Charalampos Mitrodimas Signed-off-by: Sebastian Reichel --- drivers/power/supply/power_supply_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c index 4a5b570dff449..ecef35ac3b7e4 100644 --- a/drivers/power/supply/power_supply_core.c +++ b/drivers/power/supply/power_supply_core.c @@ -1637,6 +1637,6 @@ subsys_initcall(power_supply_class_init); module_exit(power_supply_class_exit); MODULE_DESCRIPTION("Universal power supply monitor class"); -MODULE_AUTHOR("Ian Molton , " - "Szabolcs Gyurko, " - "Anton Vorontsov "); +MODULE_AUTHOR("Ian Molton "); +MODULE_AUTHOR("Szabolcs Gyurko"); +MODULE_AUTHOR("Anton Vorontsov "); -- GitLab From 195c3167865454ea909d717557c8585ccfe1b2a3 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 20 Nov 2023 15:35:32 -0700 Subject: [PATCH 0548/1290] power: reset: at91: Drop '__init' from at91_wakeup_status() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building with clang, there are two section mismatch warnings: WARNING: modpost: vmlinux: section mismatch in reference: at91_poweroff_probe+0x7c (section: .text) -> at91_wakeup_status (section: .init.text) WARNING: modpost: vmlinux: section mismatch in reference: at91_shdwc_probe+0xcc (section: .text) -> at91_wakeup_status (section: .init.text) Drop '__init' from at91_wakeup_status() to clear up the mismatch. Fixes: dde74a5de817 ("power: reset: at91-sama5d2_shdwc: Stop using module_platform_driver_probe()") Fixes: 099806de68b7 ("power: reset: at91-poweroff: Stop using module_platform_driver_probe()") Signed-off-by: Nathan Chancellor Acked-by: Nicolas Ferre Reviewed-by: Uwe Kleine-König Signed-off-by: Sebastian Reichel --- drivers/power/reset/at91-poweroff.c | 2 +- drivers/power/reset/at91-sama5d2_shdwc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/power/reset/at91-poweroff.c b/drivers/power/reset/at91-poweroff.c index 126e774e210c0..93eece0278652 100644 --- a/drivers/power/reset/at91-poweroff.c +++ b/drivers/power/reset/at91-poweroff.c @@ -57,7 +57,7 @@ static struct shdwc { void __iomem *mpddrc_base; } at91_shdwc; -static void __init at91_wakeup_status(struct platform_device *pdev) +static void at91_wakeup_status(struct platform_device *pdev) { const char *reason; u32 reg = readl(at91_shdwc.shdwc_base + AT91_SHDW_SR); diff --git a/drivers/power/reset/at91-sama5d2_shdwc.c b/drivers/power/reset/at91-sama5d2_shdwc.c index af95c7b39cb3c..959ce0dbe91d1 100644 --- a/drivers/power/reset/at91-sama5d2_shdwc.c +++ b/drivers/power/reset/at91-sama5d2_shdwc.c @@ -107,7 +107,7 @@ static const unsigned long long sdwc_dbc_period[] = { 0, 3, 32, 512, 4096, 32768, }; -static void __init at91_wakeup_status(struct platform_device *pdev) +static void at91_wakeup_status(struct platform_device *pdev) { struct shdwc *shdw = platform_get_drvdata(pdev); const struct reg_config *rcfg = shdw->rcfg; -- GitLab From 88f04bc3e737155e13caddf0ba8ed19db87f0212 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Fri, 24 Nov 2023 15:50:21 +0800 Subject: [PATCH 0549/1290] power: supply: Fix null pointer dereference in smb2_probe devm_kasprintf and devm_kzalloc return a pointer to dynamically allocated memory which can be NULL upon failure. Fixes: 8648aeb5d7b7 ("power: supply: add Qualcomm PMI8998 SMB2 Charger driver") Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20231124075021.1335289-1-chentao@kylinos.cn Signed-off-by: Sebastian Reichel --- drivers/power/supply/qcom_pmi8998_charger.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/power/supply/qcom_pmi8998_charger.c b/drivers/power/supply/qcom_pmi8998_charger.c index 8acf63ee6897f..9bb7774060138 100644 --- a/drivers/power/supply/qcom_pmi8998_charger.c +++ b/drivers/power/supply/qcom_pmi8998_charger.c @@ -972,10 +972,14 @@ static int smb2_probe(struct platform_device *pdev) supply_config.of_node = pdev->dev.of_node; desc = devm_kzalloc(chip->dev, sizeof(smb2_psy_desc), GFP_KERNEL); + if (!desc) + return -ENOMEM; memcpy(desc, &smb2_psy_desc, sizeof(smb2_psy_desc)); desc->name = devm_kasprintf(chip->dev, GFP_KERNEL, "%s-charger", (const char *)device_get_match_data(chip->dev)); + if (!desc->name) + return -ENOMEM; chip->chg_psy = devm_power_supply_register(chip->dev, desc, &supply_config); -- GitLab From 523100208bd22e3eee7b76025ee4584b5d1ea0ee Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 30 Nov 2023 18:30:17 +0100 Subject: [PATCH 0550/1290] dt-bindings: power: reset: qcom-pon: fix inconsistent example The current PON example is a bit of a mess after converting the binding document to yaml and in the process updating parts of the example to match the pmk8350 binding while leaving parts from the older pm8998 example in place. Clean up the example and make it consistent by adding some newline separators; dropping labels; removing stray spaces; fixing the PON node name; and fixing the unit address so that it matches the interrupt specifiers (which re-encodes the PON base address, 0x800 => 0x8). Fixes: 76ba1900cb67 ("dt-bindings: power: reset: qcom-pon: Convert qcom PON binding to yaml") Signed-off-by: Johan Hovold Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231130173017.12723-1-johan+linaro@kernel.org Signed-off-by: Sebastian Reichel --- .../devicetree/bindings/power/reset/qcom,pon.yaml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/power/reset/qcom,pon.yaml b/Documentation/devicetree/bindings/power/reset/qcom,pon.yaml index 5e460128b0d10..fc8105a7b9b26 100644 --- a/Documentation/devicetree/bindings/power/reset/qcom,pon.yaml +++ b/Documentation/devicetree/bindings/power/reset/qcom,pon.yaml @@ -111,21 +111,24 @@ examples: #include #include #include - spmi_bus: spmi@c440000 { + + spmi@c440000 { reg = <0x0c440000 0x1100>; #address-cells = <2>; #size-cells = <0>; - pmk8350: pmic@0 { + + pmic@0 { reg = <0x0 SPMI_USID>; #address-cells = <1>; #size-cells = <0>; - pmk8350_pon: pon_hlos@1300 { - reg = <0x1300>; + + pon@800 { compatible = "qcom,pm8998-pon"; + reg = <0x800>; pwrkey { compatible = "qcom,pm8941-pwrkey"; - interrupts = < 0x0 0x8 0 IRQ_TYPE_EDGE_BOTH >; + interrupts = <0x0 0x8 0 IRQ_TYPE_EDGE_BOTH>; debounce = <15625>; bias-pull-up; linux,code = ; -- GitLab From b43f7ddc2b7a5a90447d96cb4d3c6d142dd4a810 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Mon, 18 Dec 2023 15:41:52 +0100 Subject: [PATCH 0551/1290] power: supply: qcom_battmgr: Register the power supplies after PDR is up Currently, a not-yet-entirely-initialized battmgr (e.g. with pd-mapper not having yet started or ADSP not being up etc.) results in a couple of zombie power supply devices hanging around. This is particularly noticeable when trying to suspend the device (even s2idle): the PSY-internal thermal zone is inaccessible and returns -ENODEV, which causes log spam. Register the power supplies only after we received some notification indicating battmgr is ready to take off. Signed-off-by: Konrad Dybcio Tested-by: Luca Weiss Link: https://lore.kernel.org/r/20231218-topic-battmgr_fixture_attempt-v1-1-6145745f34fe@linaro.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/qcom_battmgr.c | 109 +++++++++++++++------------- 1 file changed, 60 insertions(+), 49 deletions(-) diff --git a/drivers/power/supply/qcom_battmgr.c b/drivers/power/supply/qcom_battmgr.c index ec163d1bcd189..a12e2a66d516f 100644 --- a/drivers/power/supply/qcom_battmgr.c +++ b/drivers/power/supply/qcom_battmgr.c @@ -282,6 +282,7 @@ struct qcom_battmgr_wireless { struct qcom_battmgr { struct device *dev; + struct auxiliary_device *adev; struct pmic_glink_client *client; enum qcom_battmgr_variant variant; @@ -1293,11 +1294,69 @@ static void qcom_battmgr_enable_worker(struct work_struct *work) dev_err(battmgr->dev, "failed to request power notifications\n"); } +static char *qcom_battmgr_battery[] = { "battery" }; + +static void qcom_battmgr_register_psy(struct qcom_battmgr *battmgr) +{ + struct power_supply_config psy_cfg_supply = {}; + struct auxiliary_device *adev = battmgr->adev; + struct power_supply_config psy_cfg = {}; + struct device *dev = &adev->dev; + + psy_cfg.drv_data = battmgr; + psy_cfg.of_node = adev->dev.of_node; + + psy_cfg_supply.drv_data = battmgr; + psy_cfg_supply.of_node = adev->dev.of_node; + psy_cfg_supply.supplied_to = qcom_battmgr_battery; + psy_cfg_supply.num_supplicants = 1; + + if (battmgr->variant == QCOM_BATTMGR_SC8280XP) { + battmgr->bat_psy = devm_power_supply_register(dev, &sc8280xp_bat_psy_desc, &psy_cfg); + if (IS_ERR(battmgr->bat_psy)) + dev_err(dev, "failed to register battery power supply (%ld)\n", + PTR_ERR(battmgr->bat_psy)); + + battmgr->ac_psy = devm_power_supply_register(dev, &sc8280xp_ac_psy_desc, &psy_cfg_supply); + if (IS_ERR(battmgr->ac_psy)) + dev_err(dev, "failed to register AC power supply (%ld)\n", + PTR_ERR(battmgr->ac_psy)); + + battmgr->usb_psy = devm_power_supply_register(dev, &sc8280xp_usb_psy_desc, &psy_cfg_supply); + if (IS_ERR(battmgr->usb_psy)) + dev_err(dev, "failed to register USB power supply (%ld)\n", + PTR_ERR(battmgr->usb_psy)); + + battmgr->wls_psy = devm_power_supply_register(dev, &sc8280xp_wls_psy_desc, &psy_cfg_supply); + if (IS_ERR(battmgr->wls_psy)) + dev_err(dev, "failed to register wireless charing power supply (%ld)\n", + PTR_ERR(battmgr->wls_psy)); + } else { + battmgr->bat_psy = devm_power_supply_register(dev, &sm8350_bat_psy_desc, &psy_cfg); + if (IS_ERR(battmgr->bat_psy)) + dev_err(dev, "failed to register battery power supply (%ld)\n", + PTR_ERR(battmgr->bat_psy)); + + battmgr->usb_psy = devm_power_supply_register(dev, &sm8350_usb_psy_desc, &psy_cfg_supply); + if (IS_ERR(battmgr->usb_psy)) + dev_err(dev, "failed to register USB power supply (%ld)\n", + PTR_ERR(battmgr->usb_psy)); + + battmgr->wls_psy = devm_power_supply_register(dev, &sm8350_wls_psy_desc, &psy_cfg_supply); + if (IS_ERR(battmgr->wls_psy)) + dev_err(dev, "failed to register wireless charing power supply (%ld)\n", + PTR_ERR(battmgr->wls_psy)); + } +} + static void qcom_battmgr_pdr_notify(void *priv, int state) { struct qcom_battmgr *battmgr = priv; if (state == SERVREG_SERVICE_STATE_UP) { + if (!battmgr->bat_psy) + qcom_battmgr_register_psy(battmgr); + battmgr->service_up = true; schedule_work(&battmgr->enable_work); } else { @@ -1312,13 +1371,9 @@ static const struct of_device_id qcom_battmgr_of_variants[] = { {} }; -static char *qcom_battmgr_battery[] = { "battery" }; - static int qcom_battmgr_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { - struct power_supply_config psy_cfg_supply = {}; - struct power_supply_config psy_cfg = {}; const struct of_device_id *match; struct qcom_battmgr *battmgr; struct device *dev = &adev->dev; @@ -1328,14 +1383,7 @@ static int qcom_battmgr_probe(struct auxiliary_device *adev, return -ENOMEM; battmgr->dev = dev; - - psy_cfg.drv_data = battmgr; - psy_cfg.of_node = adev->dev.of_node; - - psy_cfg_supply.drv_data = battmgr; - psy_cfg_supply.of_node = adev->dev.of_node; - psy_cfg_supply.supplied_to = qcom_battmgr_battery; - psy_cfg_supply.num_supplicants = 1; + battmgr->adev = adev; INIT_WORK(&battmgr->enable_work, qcom_battmgr_enable_worker); mutex_init(&battmgr->lock); @@ -1347,43 +1395,6 @@ static int qcom_battmgr_probe(struct auxiliary_device *adev, else battmgr->variant = QCOM_BATTMGR_SM8350; - if (battmgr->variant == QCOM_BATTMGR_SC8280XP) { - battmgr->bat_psy = devm_power_supply_register(dev, &sc8280xp_bat_psy_desc, &psy_cfg); - if (IS_ERR(battmgr->bat_psy)) - return dev_err_probe(dev, PTR_ERR(battmgr->bat_psy), - "failed to register battery power supply\n"); - - battmgr->ac_psy = devm_power_supply_register(dev, &sc8280xp_ac_psy_desc, &psy_cfg_supply); - if (IS_ERR(battmgr->ac_psy)) - return dev_err_probe(dev, PTR_ERR(battmgr->ac_psy), - "failed to register AC power supply\n"); - - battmgr->usb_psy = devm_power_supply_register(dev, &sc8280xp_usb_psy_desc, &psy_cfg_supply); - if (IS_ERR(battmgr->usb_psy)) - return dev_err_probe(dev, PTR_ERR(battmgr->usb_psy), - "failed to register USB power supply\n"); - - battmgr->wls_psy = devm_power_supply_register(dev, &sc8280xp_wls_psy_desc, &psy_cfg_supply); - if (IS_ERR(battmgr->wls_psy)) - return dev_err_probe(dev, PTR_ERR(battmgr->wls_psy), - "failed to register wireless charing power supply\n"); - } else { - battmgr->bat_psy = devm_power_supply_register(dev, &sm8350_bat_psy_desc, &psy_cfg); - if (IS_ERR(battmgr->bat_psy)) - return dev_err_probe(dev, PTR_ERR(battmgr->bat_psy), - "failed to register battery power supply\n"); - - battmgr->usb_psy = devm_power_supply_register(dev, &sm8350_usb_psy_desc, &psy_cfg_supply); - if (IS_ERR(battmgr->usb_psy)) - return dev_err_probe(dev, PTR_ERR(battmgr->usb_psy), - "failed to register USB power supply\n"); - - battmgr->wls_psy = devm_power_supply_register(dev, &sm8350_wls_psy_desc, &psy_cfg_supply); - if (IS_ERR(battmgr->wls_psy)) - return dev_err_probe(dev, PTR_ERR(battmgr->wls_psy), - "failed to register wireless charing power supply\n"); - } - battmgr->client = devm_pmic_glink_register_client(dev, PMIC_GLINK_OWNER_BATTMGR, qcom_battmgr_callback, -- GitLab From 97b9b383976e3347b05d8c409527c6e03c90cf72 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Thu, 21 Dec 2023 13:27:56 +0100 Subject: [PATCH 0552/1290] dt-bindings: power: reset: xilinx: Rename node names in examples Rename zynqmp-power node name to power-management which is more aligned with generic node name recommendation. Signed-off-by: Michal Simek Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/920c839ae2c9c5803c6c08b8705a0d8338bb94bc.1703161663.git.michal.simek@amd.com Signed-off-by: Sebastian Reichel --- .../devicetree/bindings/power/reset/xlnx,zynqmp-power.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/power/reset/xlnx,zynqmp-power.yaml b/Documentation/devicetree/bindings/power/reset/xlnx,zynqmp-power.yaml index 45792e216981a..799831636194f 100644 --- a/Documentation/devicetree/bindings/power/reset/xlnx,zynqmp-power.yaml +++ b/Documentation/devicetree/bindings/power/reset/xlnx,zynqmp-power.yaml @@ -57,7 +57,7 @@ examples: firmware { zynqmp-firmware { - zynqmp-power { + power-management { compatible = "xlnx,zynqmp-power"; interrupts = <0 35 4>; }; @@ -70,7 +70,7 @@ examples: firmware { zynqmp-firmware { - zynqmp-power { + power-management { compatible = "xlnx,zynqmp-power"; interrupt-parent = <&gic>; interrupts = <0 35 4>; -- GitLab From 370cc1579a79a29a6dba4d9ea8d4d0147aa41861 Mon Sep 17 00:00:00 2001 From: Hermes Zhang Date: Fri, 8 Dec 2023 11:47:07 +0800 Subject: [PATCH 0553/1290] dt-bindings: power: supply: bq24190: Add BQ24296 compatible The BQ24296 is most similar to the BQ24196, but the: 1. OTG config is split from CHG config (REG01) 2. ICHG (Fast Charge Current limit) range is smaller (<=3008mA) 3. NTC fault is simplified to 2 bits Signed-off-by: Hermes Zhang Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20231208034708.1248389-2-Hermes.Zhang@axis.com Signed-off-by: Sebastian Reichel --- Documentation/devicetree/bindings/power/supply/bq24190.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/power/supply/bq24190.yaml b/Documentation/devicetree/bindings/power/supply/bq24190.yaml index d3ebc9de8c0b4..131b7e57d22f4 100644 --- a/Documentation/devicetree/bindings/power/supply/bq24190.yaml +++ b/Documentation/devicetree/bindings/power/supply/bq24190.yaml @@ -20,6 +20,7 @@ properties: - ti,bq24192 - ti,bq24192i - ti,bq24196 + - ti,bq24296 reg: maxItems: 1 -- GitLab From b150a703b56fb6eb282d059b421652ccd9155c23 Mon Sep 17 00:00:00 2001 From: Hermes Zhang Date: Fri, 8 Dec 2023 11:47:08 +0800 Subject: [PATCH 0554/1290] power: supply: bq24190_charger: Add support for BQ24296 The BQ24296 is most similar to the BQ24196, but the: 1. OTG config is split from CHG config (REG01) 2. ICHG (Fast Charge Current limit) range is smaller (<=3008mA) 3. NTC fault is simplified to 2 bits Signed-off-by: Hermes Zhang Link: https://lore.kernel.org/r/20231208034708.1248389-3-Hermes.Zhang@axis.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq24190_charger.c | 457 +++++++++++++++++++------ 1 file changed, 354 insertions(+), 103 deletions(-) diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c index 1db290ee2591a..a8995a21fadbf 100644 --- a/drivers/power/supply/bq24190_charger.c +++ b/drivers/power/supply/bq24190_charger.c @@ -36,10 +36,16 @@ #define BQ24190_REG_POC_WDT_RESET_SHIFT 6 #define BQ24190_REG_POC_CHG_CONFIG_MASK (BIT(5) | BIT(4)) #define BQ24190_REG_POC_CHG_CONFIG_SHIFT 4 -#define BQ24190_REG_POC_CHG_CONFIG_DISABLE 0x0 -#define BQ24190_REG_POC_CHG_CONFIG_CHARGE 0x1 -#define BQ24190_REG_POC_CHG_CONFIG_OTG 0x2 -#define BQ24190_REG_POC_CHG_CONFIG_OTG_ALT 0x3 +#define BQ24190_REG_POC_CHG_CONFIG_DISABLE 0x0 +#define BQ24190_REG_POC_CHG_CONFIG_CHARGE 0x1 +#define BQ24190_REG_POC_CHG_CONFIG_OTG 0x2 +#define BQ24190_REG_POC_CHG_CONFIG_OTG_ALT 0x3 +#define BQ24296_REG_POC_OTG_CONFIG_MASK BIT(5) +#define BQ24296_REG_POC_OTG_CONFIG_SHIFT 5 +#define BQ24296_REG_POC_CHG_CONFIG_MASK BIT(4) +#define BQ24296_REG_POC_CHG_CONFIG_SHIFT 4 +#define BQ24296_REG_POC_OTG_CONFIG_DISABLE 0x0 +#define BQ24296_REG_POC_OTG_CONFIG_OTG 0x1 #define BQ24190_REG_POC_SYS_MIN_MASK (BIT(3) | BIT(2) | BIT(1)) #define BQ24190_REG_POC_SYS_MIN_SHIFT 1 #define BQ24190_REG_POC_SYS_MIN_MIN 3000 @@ -134,58 +140,23 @@ #define BQ24190_REG_F_BAT_FAULT_SHIFT 3 #define BQ24190_REG_F_NTC_FAULT_MASK (BIT(2) | BIT(1) | BIT(0)) #define BQ24190_REG_F_NTC_FAULT_SHIFT 0 +#define BQ24296_REG_F_NTC_FAULT_MASK (BIT(1) | BIT(0)) +#define BQ24296_REG_F_NTC_FAULT_SHIFT 0 #define BQ24190_REG_VPRS 0x0A /* Vendor/Part/Revision Status */ #define BQ24190_REG_VPRS_PN_MASK (BIT(5) | BIT(4) | BIT(3)) #define BQ24190_REG_VPRS_PN_SHIFT 3 -#define BQ24190_REG_VPRS_PN_24190 0x4 -#define BQ24190_REG_VPRS_PN_24192 0x5 /* Also 24193, 24196 */ -#define BQ24190_REG_VPRS_PN_24192I 0x3 +#define BQ24190_REG_VPRS_PN_24190 0x4 +#define BQ24190_REG_VPRS_PN_24192 0x5 /* Also 24193, 24196 */ +#define BQ24190_REG_VPRS_PN_24192I 0x3 +#define BQ24296_REG_VPRS_PN_MASK (BIT(7) | BIT(6) | BIT(5)) +#define BQ24296_REG_VPRS_PN_SHIFT 5 +#define BQ24296_REG_VPRS_PN_24296 0x1 #define BQ24190_REG_VPRS_TS_PROFILE_MASK BIT(2) #define BQ24190_REG_VPRS_TS_PROFILE_SHIFT 2 #define BQ24190_REG_VPRS_DEV_REG_MASK (BIT(1) | BIT(0)) #define BQ24190_REG_VPRS_DEV_REG_SHIFT 0 -/* - * The FAULT register is latched by the bq24190 (except for NTC_FAULT) - * so the first read after a fault returns the latched value and subsequent - * reads return the current value. In order to return the fault status - * to the user, have the interrupt handler save the reg's value and retrieve - * it in the appropriate health/status routine. - */ -struct bq24190_dev_info { - struct i2c_client *client; - struct device *dev; - struct extcon_dev *edev; - struct power_supply *charger; - struct power_supply *battery; - struct delayed_work input_current_limit_work; - char model_name[I2C_NAME_SIZE]; - bool initialized; - bool irq_event; - bool otg_vbus_enabled; - int charge_type; - u16 sys_min; - u16 iprechg; - u16 iterm; - u32 ichg; - u32 ichg_max; - u32 vreg; - u32 vreg_max; - struct mutex f_reg_lock; - u8 f_reg; - u8 ss_reg; - u8 watchdog; -}; - -static int bq24190_charger_set_charge_type(struct bq24190_dev_info *bdi, - const union power_supply_propval *val); - -static const unsigned int bq24190_usb_extcon_cable[] = { - EXTCON_USB, - EXTCON_NONE, -}; - /* * The tables below provide a 2-way mapping for the value that goes in * the register field and the real-world value that it represents. @@ -211,6 +182,9 @@ static const int bq24190_ccc_ichg_values[] = { 4096000, 4160000, 4224000, 4288000, 4352000, 4416000, 4480000, 4544000 }; +/* ICHG higher than 3008mA is not supported in BQ24296 */ +#define BQ24296_CCC_ICHG_VALUES_LEN 40 + /* REG04[7:2] (VREG) in uV */ static const int bq24190_cvc_vreg_values[] = { 3504000, 3520000, 3536000, 3552000, 3568000, 3584000, 3600000, 3616000, @@ -228,6 +202,68 @@ static const int bq24190_ictrc_treg_values[] = { 600, 800, 1000, 1200 }; +enum bq24190_chip { + BQ24190, + BQ24192, + BQ24192i, + BQ24196, + BQ24296, +}; + +/* + * The FAULT register is latched by the bq24190 (except for NTC_FAULT) + * so the first read after a fault returns the latched value and subsequent + * reads return the current value. In order to return the fault status + * to the user, have the interrupt handler save the reg's value and retrieve + * it in the appropriate health/status routine. + */ +struct bq24190_dev_info { + struct i2c_client *client; + struct device *dev; + struct extcon_dev *edev; + struct power_supply *charger; + struct power_supply *battery; + struct delayed_work input_current_limit_work; + char model_name[I2C_NAME_SIZE]; + bool initialized; + bool irq_event; + bool otg_vbus_enabled; + int charge_type; + u16 sys_min; + u16 iprechg; + u16 iterm; + u32 ichg; + u32 ichg_max; + u32 vreg; + u32 vreg_max; + struct mutex f_reg_lock; + u8 f_reg; + u8 ss_reg; + u8 watchdog; + const struct bq24190_chip_info *info; +}; + +struct bq24190_chip_info { + int ichg_array_size; +#ifdef CONFIG_REGULATOR + const struct regulator_desc vbus_desc; +#endif + int (*check_chip)(struct bq24190_dev_info *bdi); + int (*set_chg_config)(struct bq24190_dev_info *bdi, const u8 chg_config); + int (*set_otg_vbus)(struct bq24190_dev_info *bdi, bool enable); + u8 ntc_fault_mask; + int (*get_ntc_status)(const u8 value); +}; + +static int bq24190_charger_set_charge_type(struct bq24190_dev_info *bdi, + const union power_supply_propval *val); + +static const unsigned int bq24190_usb_extcon_cable[] = { + EXTCON_USB, + EXTCON_NONE, +}; + + /* * Return the index in 'tbl' of greatest value that is less than or equal to * 'val'. The index range returned is 0 to 'tbl_size' - 1. Assumes that @@ -529,6 +565,43 @@ static int bq24190_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) return ret; } +static int bq24296_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) +{ + int ret; + + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) { + dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", ret); + return ret; + } + + bdi->otg_vbus_enabled = enable; + if (enable) { + ret = bq24190_write_mask(bdi, BQ24190_REG_POC, + BQ24296_REG_POC_CHG_CONFIG_MASK, + BQ24296_REG_POC_CHG_CONFIG_SHIFT, + BQ24190_REG_POC_CHG_CONFIG_DISABLE); + + if (ret < 0) + goto out; + + ret = bq24190_write_mask(bdi, BQ24190_REG_POC, + BQ24296_REG_POC_OTG_CONFIG_MASK, + BQ24296_REG_POC_CHG_CONFIG_SHIFT, + BQ24296_REG_POC_OTG_CONFIG_OTG); + } else + ret = bq24190_write_mask(bdi, BQ24190_REG_POC, + BQ24296_REG_POC_OTG_CONFIG_MASK, + BQ24296_REG_POC_CHG_CONFIG_SHIFT, + BQ24296_REG_POC_OTG_CONFIG_DISABLE); + +out: + pm_runtime_mark_last_busy(bdi->dev); + pm_runtime_put_autosuspend(bdi->dev); + + return ret; +} + #ifdef CONFIG_REGULATOR static int bq24190_vbus_enable(struct regulator_dev *dev) { @@ -567,6 +640,43 @@ static int bq24190_vbus_is_enabled(struct regulator_dev *dev) return bdi->otg_vbus_enabled; } +static int bq24296_vbus_enable(struct regulator_dev *dev) +{ + return bq24296_set_otg_vbus(rdev_get_drvdata(dev), true); +} + +static int bq24296_vbus_disable(struct regulator_dev *dev) +{ + return bq24296_set_otg_vbus(rdev_get_drvdata(dev), false); +} + +static int bq24296_vbus_is_enabled(struct regulator_dev *dev) +{ + struct bq24190_dev_info *bdi = rdev_get_drvdata(dev); + int ret; + u8 val; + + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) { + dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", ret); + return ret; + } + + ret = bq24190_read_mask(bdi, BQ24190_REG_POC, + BQ24296_REG_POC_OTG_CONFIG_MASK, + BQ24296_REG_POC_OTG_CONFIG_SHIFT, &val); + + pm_runtime_mark_last_busy(bdi->dev); + pm_runtime_put_autosuspend(bdi->dev); + + if (ret) + return ret; + + bdi->otg_vbus_enabled = (val == BQ24296_REG_POC_OTG_CONFIG_OTG); + + return bdi->otg_vbus_enabled; +} + static const struct regulator_ops bq24190_vbus_ops = { .enable = bq24190_vbus_enable, .disable = bq24190_vbus_disable, @@ -583,6 +693,22 @@ static const struct regulator_desc bq24190_vbus_desc = { .n_voltages = 1, }; +static const struct regulator_ops bq24296_vbus_ops = { + .enable = bq24296_vbus_enable, + .disable = bq24296_vbus_disable, + .is_enabled = bq24296_vbus_is_enabled, +}; + +static const struct regulator_desc bq24296_vbus_desc = { + .name = "usb_otg_vbus", + .of_match = "usb-otg-vbus", + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + .ops = &bq24296_vbus_ops, + .fixed_uV = 5000000, + .n_voltages = 1, +}; + static const struct regulator_init_data bq24190_vbus_init_data = { .constraints = { .valid_ops_mask = REGULATOR_CHANGE_STATUS, @@ -602,7 +728,7 @@ static int bq24190_register_vbus_regulator(struct bq24190_dev_info *bdi) else cfg.init_data = &bq24190_vbus_init_data; cfg.driver_data = bdi; - reg = devm_regulator_register(bdi->dev, &bq24190_vbus_desc, &cfg); + reg = devm_regulator_register(bdi->dev, &bdi->info->vbus_desc, &cfg); if (IS_ERR(reg)) { ret = PTR_ERR(reg); dev_err(bdi->dev, "Can't register regulator: %d\n", ret); @@ -678,7 +804,7 @@ static int bq24190_set_config(struct bq24190_dev_info *bdi) BQ24190_REG_CCC_ICHG_MASK, BQ24190_REG_CCC_ICHG_SHIFT, bq24190_ccc_ichg_values, - ARRAY_SIZE(bq24190_ccc_ichg_values), + bdi->info->ichg_array_size, bdi->ichg); if (ret < 0) return ret; @@ -777,6 +903,24 @@ static int bq24190_charger_get_charge_type(struct bq24190_dev_info *bdi, return 0; } +static int bq24190_battery_set_chg_config(struct bq24190_dev_info *bdi, + const u8 chg_config) +{ + return bq24190_write_mask(bdi, BQ24190_REG_POC, + BQ24190_REG_POC_CHG_CONFIG_MASK, + BQ24190_REG_POC_CHG_CONFIG_SHIFT, + chg_config); +} + +static int bq24296_battery_set_chg_config(struct bq24190_dev_info *bdi, + const u8 chg_config) +{ + return bq24190_write_mask(bdi, BQ24190_REG_POC, + BQ24296_REG_POC_CHG_CONFIG_MASK, + BQ24296_REG_POC_CHG_CONFIG_SHIFT, + chg_config); +} + static int bq24190_charger_set_charge_type(struct bq24190_dev_info *bdi, const union power_supply_propval *val) { @@ -835,9 +979,50 @@ static int bq24190_charger_set_charge_type(struct bq24190_dev_info *bdi, return ret; } - return bq24190_write_mask(bdi, BQ24190_REG_POC, - BQ24190_REG_POC_CHG_CONFIG_MASK, - BQ24190_REG_POC_CHG_CONFIG_SHIFT, chg_config); + return bdi->info->set_chg_config(bdi, chg_config); +} + +static int bq24190_charger_get_ntc_status(u8 value) +{ + int health; + + switch (value >> BQ24190_REG_F_NTC_FAULT_SHIFT & 0x7) { + case 0x1: /* TS1 Cold */ + case 0x3: /* TS2 Cold */ + case 0x5: /* Both Cold */ + health = POWER_SUPPLY_HEALTH_COLD; + break; + case 0x2: /* TS1 Hot */ + case 0x4: /* TS2 Hot */ + case 0x6: /* Both Hot */ + health = POWER_SUPPLY_HEALTH_OVERHEAT; + break; + default: + health = POWER_SUPPLY_HEALTH_UNKNOWN; + } + + return health; +} + +static int bq24296_charger_get_ntc_status(u8 value) +{ + int health; + + switch (value >> BQ24296_REG_F_NTC_FAULT_SHIFT & 0x3) { + case 0x0: /* Normal */ + health = POWER_SUPPLY_HEALTH_GOOD; + break; + case 0x1: /* Hot */ + health = POWER_SUPPLY_HEALTH_OVERHEAT; + break; + case 0x2: /* Cold */ + health = POWER_SUPPLY_HEALTH_COLD; + break; + default: + health = POWER_SUPPLY_HEALTH_UNKNOWN; + } + + return health; } static int bq24190_charger_get_health(struct bq24190_dev_info *bdi, @@ -850,21 +1035,8 @@ static int bq24190_charger_get_health(struct bq24190_dev_info *bdi, v = bdi->f_reg; mutex_unlock(&bdi->f_reg_lock); - if (v & BQ24190_REG_F_NTC_FAULT_MASK) { - switch (v >> BQ24190_REG_F_NTC_FAULT_SHIFT & 0x7) { - case 0x1: /* TS1 Cold */ - case 0x3: /* TS2 Cold */ - case 0x5: /* Both Cold */ - health = POWER_SUPPLY_HEALTH_COLD; - break; - case 0x2: /* TS1 Hot */ - case 0x4: /* TS2 Hot */ - case 0x6: /* Both Hot */ - health = POWER_SUPPLY_HEALTH_OVERHEAT; - break; - default: - health = POWER_SUPPLY_HEALTH_UNKNOWN; - } + if (v & bdi->info->ntc_fault_mask) { + health = bdi->info->get_ntc_status(v); } else if (v & BQ24190_REG_F_BAT_FAULT_MASK) { health = POWER_SUPPLY_HEALTH_OVERVOLTAGE; } else if (v & BQ24190_REG_F_CHRG_FAULT_MASK) { @@ -1015,7 +1187,7 @@ static int bq24190_charger_get_current(struct bq24190_dev_info *bdi, ret = bq24190_get_field_val(bdi, BQ24190_REG_CCC, BQ24190_REG_CCC_ICHG_MASK, BQ24190_REG_CCC_ICHG_SHIFT, bq24190_ccc_ichg_values, - ARRAY_SIZE(bq24190_ccc_ichg_values), &curr); + bdi->info->ichg_array_size, &curr); if (ret < 0) return ret; @@ -1055,7 +1227,7 @@ static int bq24190_charger_set_current(struct bq24190_dev_info *bdi, ret = bq24190_set_field_val(bdi, BQ24190_REG_CCC, BQ24190_REG_CCC_ICHG_MASK, BQ24190_REG_CCC_ICHG_SHIFT, bq24190_ccc_ichg_values, - ARRAY_SIZE(bq24190_ccc_ichg_values), curr); + bdi->info->ichg_array_size, curr); if (ret < 0) return ret; @@ -1395,26 +1567,9 @@ static int bq24190_battery_get_health(struct bq24190_dev_info *bdi, if (v & BQ24190_REG_F_BAT_FAULT_MASK) { health = POWER_SUPPLY_HEALTH_OVERVOLTAGE; } else { - v &= BQ24190_REG_F_NTC_FAULT_MASK; - v >>= BQ24190_REG_F_NTC_FAULT_SHIFT; + v &= bdi->info->ntc_fault_mask; - switch (v) { - case 0x0: /* Normal */ - health = POWER_SUPPLY_HEALTH_GOOD; - break; - case 0x1: /* TS1 Cold */ - case 0x3: /* TS2 Cold */ - case 0x5: /* Both Cold */ - health = POWER_SUPPLY_HEALTH_COLD; - break; - case 0x2: /* TS1 Hot */ - case 0x4: /* TS2 Hot */ - case 0x6: /* Both Hot */ - health = POWER_SUPPLY_HEALTH_OVERHEAT; - break; - default: - health = POWER_SUPPLY_HEALTH_UNKNOWN; - } + health = v ? bdi->info->get_ntc_status(v) : POWER_SUPPLY_HEALTH_GOOD; } val->intval = health; @@ -1601,12 +1756,13 @@ static int bq24190_configure_usb_otg(struct bq24190_dev_info *bdi, u8 ss_reg) static void bq24190_check_status(struct bq24190_dev_info *bdi) { const u8 battery_mask_ss = BQ24190_REG_SS_CHRG_STAT_MASK; - const u8 battery_mask_f = BQ24190_REG_F_BAT_FAULT_MASK - | BQ24190_REG_F_NTC_FAULT_MASK; + u8 battery_mask_f = BQ24190_REG_F_BAT_FAULT_MASK; bool alert_charger = false, alert_battery = false; u8 ss_reg = 0, f_reg = 0; int i, ret; + battery_mask_f |= bdi->info->ntc_fault_mask; + ret = bq24190_read(bdi, BQ24190_REG_SS, &ss_reg); if (ret < 0) { dev_err(bdi->dev, "Can't read SS reg: %d\n", ret); @@ -1633,7 +1789,7 @@ static void bq24190_check_status(struct bq24190_dev_info *bdi) !!(f_reg & BQ24190_REG_F_BOOST_FAULT_MASK), !!(f_reg & BQ24190_REG_F_CHRG_FAULT_MASK), !!(f_reg & BQ24190_REG_F_BAT_FAULT_MASK), - !!(f_reg & BQ24190_REG_F_NTC_FAULT_MASK)); + !!(f_reg & bdi->info->ntc_fault_mask)); mutex_lock(&bdi->f_reg_lock); if ((bdi->f_reg & battery_mask_f) != (f_reg & battery_mask_f)) @@ -1696,12 +1852,11 @@ static irqreturn_t bq24190_irq_handler_thread(int irq, void *data) return IRQ_HANDLED; } -static int bq24190_hw_init(struct bq24190_dev_info *bdi) +static int bq24190_check_chip(struct bq24190_dev_info *bdi) { u8 v; int ret; - /* First check that the device really is what its supposed to be */ ret = bq24190_read_mask(bdi, BQ24190_REG_VPRS, BQ24190_REG_VPRS_PN_MASK, BQ24190_REG_VPRS_PN_SHIFT, @@ -1719,6 +1874,40 @@ static int bq24190_hw_init(struct bq24190_dev_info *bdi) return -ENODEV; } + return 0; +} + +static int bq24296_check_chip(struct bq24190_dev_info *bdi) +{ + u8 v; + int ret; + + ret = bq24190_read_mask(bdi, BQ24190_REG_VPRS, + BQ24296_REG_VPRS_PN_MASK, + BQ24296_REG_VPRS_PN_SHIFT, + &v); + if (ret < 0) + return ret; + + switch (v) { + case BQ24296_REG_VPRS_PN_24296: + break; + default: + dev_err(bdi->dev, "Error unknown model: 0x%02x\n", v); + return -ENODEV; + } + + return 0; +} + +static int bq24190_hw_init(struct bq24190_dev_info *bdi) +{ + int ret; + + ret = bdi->info->check_chip(bdi); + if (ret < 0) + return ret; + ret = bq24190_register_reset(bdi); if (ret < 0) return ret; @@ -1736,7 +1925,8 @@ static int bq24190_get_config(struct bq24190_dev_info *bdi) struct power_supply_battery_info *info; int v, idx; - idx = ARRAY_SIZE(bq24190_ccc_ichg_values) - 1; + idx = bdi->info->ichg_array_size - 1; + bdi->ichg_max = bq24190_ccc_ichg_values[idx]; idx = ARRAY_SIZE(bq24190_cvc_vreg_values) - 1; @@ -1781,6 +1971,64 @@ static int bq24190_get_config(struct bq24190_dev_info *bdi) return 0; } +static const struct bq24190_chip_info bq24190_chip_info_tbl[] = { + [BQ24190] = { + .ichg_array_size = ARRAY_SIZE(bq24190_ccc_ichg_values), +#ifdef CONFIG_REGULATOR + .vbus_desc = bq24190_vbus_desc, +#endif + .check_chip = bq24190_check_chip, + .set_chg_config = bq24190_battery_set_chg_config, + .ntc_fault_mask = BQ24190_REG_F_NTC_FAULT_MASK, + .get_ntc_status = bq24190_charger_get_ntc_status, + .set_otg_vbus = bq24190_set_otg_vbus, + }, + [BQ24192] = { + .ichg_array_size = ARRAY_SIZE(bq24190_ccc_ichg_values), +#ifdef CONFIG_REGULATOR + .vbus_desc = bq24190_vbus_desc, +#endif + .check_chip = bq24190_check_chip, + .set_chg_config = bq24190_battery_set_chg_config, + .ntc_fault_mask = BQ24190_REG_F_NTC_FAULT_MASK, + .get_ntc_status = bq24190_charger_get_ntc_status, + .set_otg_vbus = bq24190_set_otg_vbus, + }, + [BQ24192i] = { + .ichg_array_size = ARRAY_SIZE(bq24190_ccc_ichg_values), +#ifdef CONFIG_REGULATOR + .vbus_desc = bq24190_vbus_desc, +#endif + .check_chip = bq24190_check_chip, + .set_chg_config = bq24190_battery_set_chg_config, + .ntc_fault_mask = BQ24190_REG_F_NTC_FAULT_MASK, + .get_ntc_status = bq24190_charger_get_ntc_status, + .set_otg_vbus = bq24190_set_otg_vbus, + }, + [BQ24196] = { + .ichg_array_size = ARRAY_SIZE(bq24190_ccc_ichg_values), +#ifdef CONFIG_REGULATOR + .vbus_desc = bq24190_vbus_desc, +#endif + .check_chip = bq24190_check_chip, + .set_chg_config = bq24190_battery_set_chg_config, + .ntc_fault_mask = BQ24190_REG_F_NTC_FAULT_MASK, + .get_ntc_status = bq24190_charger_get_ntc_status, + .set_otg_vbus = bq24190_set_otg_vbus, + }, + [BQ24296] = { + .ichg_array_size = BQ24296_CCC_ICHG_VALUES_LEN, +#ifdef CONFIG_REGULATOR + .vbus_desc = bq24296_vbus_desc, +#endif + .check_chip = bq24296_check_chip, + .set_chg_config = bq24296_battery_set_chg_config, + .ntc_fault_mask = BQ24296_REG_F_NTC_FAULT_MASK, + .get_ntc_status = bq24296_charger_get_ntc_status, + .set_otg_vbus = bq24296_set_otg_vbus, + }, +}; + static int bq24190_probe(struct i2c_client *client) { const struct i2c_device_id *id = i2c_client_get_device_id(client); @@ -1804,6 +2052,7 @@ static int bq24190_probe(struct i2c_client *client) bdi->client = client; bdi->dev = dev; strscpy(bdi->model_name, id->name, sizeof(bdi->model_name)); + bdi->info = i2c_get_match_data(client); mutex_init(&bdi->f_reg_lock); bdi->charge_type = POWER_SUPPLY_CHARGE_TYPE_FAST; bdi->f_reg = 0; @@ -1940,7 +2189,7 @@ static void bq24190_shutdown(struct i2c_client *client) struct bq24190_dev_info *bdi = i2c_get_clientdata(client); /* Turn off 5V boost regulator on shutdown */ - bq24190_set_otg_vbus(bdi, false); + bdi->info->set_otg_vbus(bdi, false); } static __maybe_unused int bq24190_runtime_suspend(struct device *dev) @@ -2029,19 +2278,21 @@ static const struct dev_pm_ops bq24190_pm_ops = { }; static const struct i2c_device_id bq24190_i2c_ids[] = { - { "bq24190" }, - { "bq24192" }, - { "bq24192i" }, - { "bq24196" }, + { "bq24190", (kernel_ulong_t)&bq24190_chip_info_tbl[BQ24190] }, + { "bq24192", (kernel_ulong_t)&bq24190_chip_info_tbl[BQ24192] }, + { "bq24192i", (kernel_ulong_t)&bq24190_chip_info_tbl[BQ24192i] }, + { "bq24196", (kernel_ulong_t)&bq24190_chip_info_tbl[BQ24196] }, + { "bq24296", (kernel_ulong_t)&bq24190_chip_info_tbl[BQ24296] }, { }, }; MODULE_DEVICE_TABLE(i2c, bq24190_i2c_ids); static const struct of_device_id bq24190_of_match[] = { - { .compatible = "ti,bq24190", }, - { .compatible = "ti,bq24192", }, - { .compatible = "ti,bq24192i", }, - { .compatible = "ti,bq24196", }, + { .compatible = "ti,bq24190", .data = &bq24190_chip_info_tbl[BQ24190] }, + { .compatible = "ti,bq24192", .data = &bq24190_chip_info_tbl[BQ24192] }, + { .compatible = "ti,bq24192i", .data = &bq24190_chip_info_tbl[BQ24192i] }, + { .compatible = "ti,bq24196", .data = &bq24190_chip_info_tbl[BQ24196] }, + { .compatible = "ti,bq24296", .data = &bq24190_chip_info_tbl[BQ24296] }, { }, }; MODULE_DEVICE_TABLE(of, bq24190_of_match); -- GitLab From c1b9f2c66eed3261db76cccd8a22a9affae8dcbf Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 20 Oct 2022 21:21:09 +0200 Subject: [PATCH 0555/1290] vdpa: Fix an error handling path in eni_vdpa_probe() After a successful vp_legacy_probe() call, vp_legacy_remove() should be called in the error handling path, as already done in the remove function. Add the missing call. Fixes: e85087beedca ("eni_vdpa: add vDPA driver for Alibaba ENI") Signed-off-by: Christophe JAILLET Message-Id: Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/alibaba/eni_vdpa.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c index 5a09a09cca709..cce3d1837104c 100644 --- a/drivers/vdpa/alibaba/eni_vdpa.c +++ b/drivers/vdpa/alibaba/eni_vdpa.c @@ -497,7 +497,7 @@ static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!eni_vdpa->vring) { ret = -ENOMEM; ENI_ERR(pdev, "failed to allocate virtqueues\n"); - goto err; + goto err_remove_vp_legacy; } for (i = 0; i < eni_vdpa->queues; i++) { @@ -509,11 +509,13 @@ static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = vdpa_register_device(&eni_vdpa->vdpa, eni_vdpa->queues); if (ret) { ENI_ERR(pdev, "failed to register to vdpa bus\n"); - goto err; + goto err_remove_vp_legacy; } return 0; +err_remove_vp_legacy: + vp_legacy_remove(&eni_vdpa->ldev); err: put_device(&eni_vdpa->vdpa.dev); return ret; -- GitLab From b91cf01cf3e63a627b3b65f4284dcf9a4deb80f9 Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Mon, 4 Dec 2023 17:51:08 +0800 Subject: [PATCH 0556/1290] dt-bindings: timer: thead,c900-aclint-mtimer: separate mtime and mtimecmp regs The timer registers of aclint don't follow the clint layout and can be mapped on any different offset. As sg2042 uses separated timer and mswi for its clint, it should follow the aclint spec and have separated registers. The previous patch introduced a new type of T-HEAD aclint timer which has clint timer layout. Although it has the clint timer layout, it should follow the aclint spec and uses the separated mtime and mtimecmp regs. So a ABI change is needed to make the timer fit the aclint spec. To make T-HEAD aclint timer more closer to the aclint spec, use regs-names to represent the mtimecmp register, which can avoid hack for unsupport mtime register of T-HEAD aclint timer. Also, as T-HEAD aclint only supports mtimecmp, it is unnecessary to implement the whole aclint spec. To make this binding T-HEAD specific, only add reg-name for existed register. For details, see the discussion in the last link. Signed-off-by: Inochi Amaoto Fixes: 4734449f7311 ("dt-bindings: timer: Add Sophgo sg2042 CLINT timer") Link: https://lists.infradead.org/pipermail/opensbi/2023-October/005693.html Link: https://github.com/riscv/riscv-aclint/blob/main/riscv-aclint.adoc Link: https://lore.kernel.org/all/IA1PR20MB4953F9D77FFC76A9D236922DBBB6A@IA1PR20MB4953.namprd20.prod.outlook.com/ Acked-by: Guo Ren Acked-by: Conor Dooley Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/IA1PR20MB49531ED1BCC00D6B265C2D10BB86A@IA1PR20MB4953.namprd20.prod.outlook.com --- .../bindings/timer/thead,c900-aclint-mtimer.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/timer/thead,c900-aclint-mtimer.yaml b/Documentation/devicetree/bindings/timer/thead,c900-aclint-mtimer.yaml index fbd235650e52c..2e92bcdeb423a 100644 --- a/Documentation/devicetree/bindings/timer/thead,c900-aclint-mtimer.yaml +++ b/Documentation/devicetree/bindings/timer/thead,c900-aclint-mtimer.yaml @@ -17,7 +17,12 @@ properties: - const: thead,c900-aclint-mtimer reg: - maxItems: 1 + items: + - description: MTIMECMP Registers + + reg-names: + items: + - const: mtimecmp interrupts-extended: minItems: 1 @@ -28,6 +33,7 @@ additionalProperties: false required: - compatible - reg + - reg-names - interrupts-extended examples: @@ -39,5 +45,6 @@ examples: <&cpu3intc 7>, <&cpu4intc 7>; reg = <0xac000000 0x00010000>; + reg-names = "mtimecmp"; }; ... -- GitLab From e0cf60151e6317c654c42ba0e8b1fb6ff477489a Mon Sep 17 00:00:00 2001 From: Sia Jee Heng Date: Fri, 1 Dec 2023 20:14:07 +0800 Subject: [PATCH 0557/1290] dt-bindings: timer: Add StarFive JH8100 clint Add compatible string for the StarFive JH8100 clint. Signed-off-by: Sia Jee Heng Reviewed-by: Ley Foon Tan Acked-by: Conor Dooley Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20231201121410.95298-4-jeeheng.sia@starfivetech.com --- Documentation/devicetree/bindings/timer/sifive,clint.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/timer/sifive,clint.yaml b/Documentation/devicetree/bindings/timer/sifive,clint.yaml index e8be6c4703640..01254261e1563 100644 --- a/Documentation/devicetree/bindings/timer/sifive,clint.yaml +++ b/Documentation/devicetree/bindings/timer/sifive,clint.yaml @@ -33,6 +33,7 @@ properties: - sifive,fu540-c000-clint # SiFive FU540 - starfive,jh7100-clint # StarFive JH7100 - starfive,jh7110-clint # StarFive JH7110 + - starfive,jh8100-clint # StarFive JH8100 - const: sifive,clint0 # SiFive CLINT v0 IP block - items: - enum: -- GitLab From 6a902b118e7f30dbf0e6248f7b0f97e12c0939c3 Mon Sep 17 00:00:00 2001 From: Joshua Yeong Date: Thu, 16 Nov 2023 18:53:12 +0800 Subject: [PATCH 0558/1290] clocksource/timer-riscv: Add riscv_clock_shutdown callback Add clocksource detach/shutdown callback to disable RISC-V timer interrupt when switching out riscv timer as clock source Signed-off-by: Joshua Yeong Reviewed-by: Anup Patel Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20231116105312.4800-1-joshua.yeong@starfivetech.com --- drivers/clocksource/timer-riscv.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/clocksource/timer-riscv.c b/drivers/clocksource/timer-riscv.c index 57857c0dfba97..e66dcbd665665 100644 --- a/drivers/clocksource/timer-riscv.c +++ b/drivers/clocksource/timer-riscv.c @@ -61,12 +61,19 @@ static int riscv_clock_next_event(unsigned long delta, return 0; } +static int riscv_clock_shutdown(struct clock_event_device *evt) +{ + riscv_clock_event_stop(); + return 0; +} + static unsigned int riscv_clock_event_irq; static DEFINE_PER_CPU(struct clock_event_device, riscv_clock_event) = { .name = "riscv_timer_clockevent", .features = CLOCK_EVT_FEAT_ONESHOT, .rating = 100, .set_next_event = riscv_clock_next_event, + .set_state_shutdown = riscv_clock_shutdown, }; /* -- GitLab From b99a212a7697c542b460adaa15d4a98abf8223f0 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 14 Nov 2023 09:29:30 +0200 Subject: [PATCH 0559/1290] clocksource/drivers/timer-ti-dm: Fix make W=n kerneldoc warnings Kernel test robot reports of kerneldoc related warnings that happen with make W=n for "parameter or member not described". These were caused by changes to function parameter names with earlier commits where the kerneldoc parts were not updated. Fixes: 49cd16bb573e ("clocksource/drivers/timer-ti-dm: Simplify register writes with dmtimer_write()") Fixes: a6e543f61531 ("clocksource/drivers/timer-ti-dm: Move struct omap_dm_timer fields to driver") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311040403.DzIiBuwU-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202311040606.XL5OcR9O-lkp@intel.com/ Signed-off-by: Tony Lindgren Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20231114072930.40615-1-tony@atomide.com --- drivers/clocksource/timer-ti-dm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 5f60f6bd33866..56acf26172621 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -183,7 +183,7 @@ static inline u32 dmtimer_read(struct dmtimer *timer, u32 reg) * dmtimer_write - write timer registers in posted and non-posted mode * @timer: timer pointer over which write operation is to perform * @reg: lowest byte holds the register offset - * @value: data to write into the register + * @val: data to write into the register * * The posted mode bit is encoded in reg. Note that in posted mode, the write * pending bit must be checked. Otherwise a write on a register which has a @@ -949,7 +949,7 @@ static int omap_dm_timer_set_int_enable(struct omap_dm_timer *cookie, /** * omap_dm_timer_set_int_disable - disable timer interrupts - * @timer: pointer to timer handle + * @cookie: pointer to timer cookie * @mask: bit mask of interrupts to be disabled * * Disables the specified timer interrupts for a timer. -- GitLab From 0515c73467fd550249ef83062e1d03d99c718b4f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 5 Dec 2023 15:04:48 -0800 Subject: [PATCH 0560/1290] clocksource/drivers/cadence-ttc: Fix some kernel-doc warnings Fix some function kernel-doc warnings to placate scripts/kernel-doc. timer-cadence-ttc.c:79: warning: Function parameter or member 'clk_rate_change_nb' not described in 'ttc_timer' timer-cadence-ttc.c:158: warning: Function parameter or member 'cs' not described in '__ttc_clocksource_read' timer-cadence-ttc.c:194: warning: expecting prototype for ttc_set_{shutdown|oneshot|periodic}(). Prototype was for ttc_shutdown() instead timer-cadence-ttc.c:196: warning: No description found for return value of 'ttc_shutdown' timer-cadence-ttc.c:212: warning: No description found for return value of 'ttc_set_periodic' Signed-off-by: Randy Dunlap Cc: Michal Simek Cc: Daniel Lezcano Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Acked-by: Michal Simek Tested-by: Michal Simek Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20231205230448.772-1-rdunlap@infradead.org --- drivers/clocksource/timer-cadence-ttc.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/clocksource/timer-cadence-ttc.c b/drivers/clocksource/timer-cadence-ttc.c index 32daaac9b1320..ca7a06489c405 100644 --- a/drivers/clocksource/timer-cadence-ttc.c +++ b/drivers/clocksource/timer-cadence-ttc.c @@ -69,7 +69,7 @@ * @base_addr: Base address of timer * @freq: Timer input clock frequency * @clk: Associated clock source - * @clk_rate_change_nb Notifier block for clock rate changes + * @clk_rate_change_nb: Notifier block for clock rate changes */ struct ttc_timer { void __iomem *base_addr; @@ -134,7 +134,7 @@ static void ttc_set_interval(struct ttc_timer *timer, * @irq: IRQ number of the Timer * @dev_id: void pointer to the ttc_timer instance * - * returns: Always IRQ_HANDLED - success + * Returns: Always IRQ_HANDLED - success **/ static irqreturn_t ttc_clock_event_interrupt(int irq, void *dev_id) { @@ -151,8 +151,9 @@ static irqreturn_t ttc_clock_event_interrupt(int irq, void *dev_id) /** * __ttc_clocksource_read - Reads the timer counter register + * @cs: &clocksource to read from * - * returns: Current timer counter register value + * Returns: Current timer counter register value **/ static u64 __ttc_clocksource_read(struct clocksource *cs) { @@ -173,7 +174,7 @@ static u64 notrace ttc_sched_clock_read(void) * @cycles: Timer interval ticks * @evt: Address of clock event instance * - * returns: Always 0 - success + * Returns: Always %0 - success **/ static int ttc_set_next_event(unsigned long cycles, struct clock_event_device *evt) @@ -186,9 +187,12 @@ static int ttc_set_next_event(unsigned long cycles, } /** - * ttc_set_{shutdown|oneshot|periodic} - Sets the state of timer - * + * ttc_shutdown - Sets the state of timer * @evt: Address of clock event instance + * + * Used for shutdown or oneshot. + * + * Returns: Always %0 - success **/ static int ttc_shutdown(struct clock_event_device *evt) { @@ -202,6 +206,12 @@ static int ttc_shutdown(struct clock_event_device *evt) return 0; } +/** + * ttc_set_periodic - Sets the state of timer + * @evt: Address of clock event instance + * + * Returns: Always %0 - success + */ static int ttc_set_periodic(struct clock_event_device *evt) { struct ttc_timer_clockevent *ttce = to_ttc_timer_clkevent(evt); -- GitLab From c0c4579d79d0df841e825c68df450909a0032faf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Dec 2023 22:46:07 +0100 Subject: [PATCH 0561/1290] clocksource/drivers/ep93xx: Fix error handling during probe When the interrupt property fails to be parsed, ep93xx_timer_of_init() return code ends up uninitialized: drivers/clocksource/timer-ep93xx.c:160:6: error: variable 'ret' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized] if (irq < 0) { ^~~~~~~ drivers/clocksource/timer-ep93xx.c:188:9: note: uninitialized use occurs here return ret; ^~~ drivers/clocksource/timer-ep93xx.c:160:2: note: remove the 'if' if its condition is always false if (irq < 0) { ^~~~~~~~~~~~~~ Simplify this portion to use the normal construct of just checking whether a valid interrupt was returned. Note that irq_of_parse_and_map() never returns a negative value and no other callers check for that either. Fixes: c28ca80ba3b5 ("clocksource: ep93xx: Add driver for Cirrus Logic EP93xx") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20231212214616.193098-1-arnd@kernel.org --- drivers/clocksource/timer-ep93xx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/clocksource/timer-ep93xx.c b/drivers/clocksource/timer-ep93xx.c index bc0ca6e123349..6981ff3ac8a94 100644 --- a/drivers/clocksource/timer-ep93xx.c +++ b/drivers/clocksource/timer-ep93xx.c @@ -155,9 +155,8 @@ static int __init ep93xx_timer_of_init(struct device_node *np) ep93xx_tcu = tcu; irq = irq_of_parse_and_map(np, 0); - if (irq == 0) - irq = -EINVAL; - if (irq < 0) { + if (!irq) { + ret = -EINVAL; pr_err("EP93XX Timer Can't parse IRQ %d", irq); goto out_free; } -- GitLab From ab78ffe1ff7d17102972348bb9b1a16ec2696a2b Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 26 Dec 2023 18:28:27 +0000 Subject: [PATCH 0562/1290] vhost-vdpa: account iommu allocations iommu allocations should be accounted in order to allow admins to monitor and limit the amount of iommu memory. Signed-off-by: Pasha Tatashin Acked-by: Michael S. Tsirkin Message-Id: <20231226182827.294158-1-pasha.tatashin@soleen.com> Signed-off-by: Michael S. Tsirkin Acked-by: David Rientjes --- drivers/vhost/vdpa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index da7ec77cdaff0..a51c69c078d93 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -968,7 +968,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb, r = ops->set_map(vdpa, asid, iotlb); } else { r = iommu_map(v->domain, iova, pa, size, - perm_to_iommu_flags(perm), GFP_KERNEL); + perm_to_iommu_flags(perm), + GFP_KERNEL_ACCOUNT); } if (r) { vhost_iotlb_del_range(iotlb, iova, iova + size - 1); -- GitLab From d2c4f1928a3f7f4a1f28a0cfc022e8a145ce6903 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 26 Dec 2023 17:43:33 +0800 Subject: [PATCH 0563/1290] virtio_net: fix missing dma unmap for resize For rq, we have three cases getting buffers from virtio core: 1. virtqueue_get_buf{,_ctx} 2. virtqueue_detach_unused_buf 3. callback for virtqueue_resize But in commit 295525e29a5b("virtio_net: merge dma operations when filling mergeable buffers"), I missed the dma unmap for the #3 case. That will leak some memory, because I did not release the pages referred by the unused buffers. If we do such script, we will make the system OOM. while true do ethtool -G ens4 rx 128 ethtool -G ens4 rx 256 free -m done Fixes: 295525e29a5b ("virtio_net: merge dma operations when filling mergeable buffers") Signed-off-by: Xuan Zhuo Message-Id: <20231226094333.47740-1-xuanzhuo@linux.alibaba.com> Signed-off-by: Michael S. Tsirkin --- drivers/net/virtio_net.c | 60 ++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d16f592c2061f..51b1868d2f220 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -334,7 +334,6 @@ struct virtio_net_common_hdr { }; }; -static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf); static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); static bool is_xdp_frame(void *ptr) @@ -408,6 +407,17 @@ static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask) return p; } +static void virtnet_rq_free_buf(struct virtnet_info *vi, + struct receive_queue *rq, void *buf) +{ + if (vi->mergeable_rx_bufs) + put_page(virt_to_head_page(buf)); + else if (vi->big_packets) + give_pages(rq, buf); + else + put_page(virt_to_head_page(buf)); +} + static void enable_delayed_refill(struct virtnet_info *vi) { spin_lock_bh(&vi->refill_lock); @@ -634,17 +644,6 @@ static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx) return buf; } -static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq) -{ - void *buf; - - buf = virtqueue_detach_unused_buf(rq->vq); - if (buf && rq->do_dma) - virtnet_rq_unmap(rq, buf, 0); - - return buf; -} - static void virtnet_rq_init_one_sg(struct receive_queue *rq, void *buf, u32 len) { struct virtnet_rq_dma *dma; @@ -744,6 +743,20 @@ static void virtnet_rq_set_premapped(struct virtnet_info *vi) } } +static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) +{ + struct virtnet_info *vi = vq->vdev->priv; + struct receive_queue *rq; + int i = vq2rxq(vq); + + rq = &vi->rq[i]; + + if (rq->do_dma) + virtnet_rq_unmap(rq, buf, 0); + + virtnet_rq_free_buf(vi, rq, buf); +} + static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi) { unsigned int len; @@ -1764,7 +1777,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, if (unlikely(len < vi->hdr_len + ETH_HLEN)) { pr_debug("%s: short packet %i\n", dev->name, len); DEV_STATS_INC(dev, rx_length_errors); - virtnet_rq_free_unused_buf(rq->vq, buf); + virtnet_rq_free_buf(vi, rq, buf); return; } @@ -2392,7 +2405,7 @@ static int virtnet_rx_resize(struct virtnet_info *vi, if (running) napi_disable(&rq->napi); - err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_free_unused_buf); + err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf); if (err) netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err); @@ -4031,19 +4044,6 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf) xdp_return_frame(ptr_to_xdp(buf)); } -static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) -{ - struct virtnet_info *vi = vq->vdev->priv; - int i = vq2rxq(vq); - - if (vi->mergeable_rx_bufs) - put_page(virt_to_head_page(buf)); - else if (vi->big_packets) - give_pages(&vi->rq[i], buf); - else - put_page(virt_to_head_page(buf)); -} - static void free_unused_bufs(struct virtnet_info *vi) { void *buf; @@ -4057,10 +4057,10 @@ static void free_unused_bufs(struct virtnet_info *vi) } for (i = 0; i < vi->max_queue_pairs; i++) { - struct receive_queue *rq = &vi->rq[i]; + struct virtqueue *vq = vi->rq[i].vq; - while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL) - virtnet_rq_free_unused_buf(rq->vq, buf); + while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) + virtnet_rq_unmap_free_buf(vq, buf); cond_resched(); } } -- GitLab From dff4fa0e57856045359440d05af9e9b7f7048f52 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Fri, 8 Dec 2023 16:07:54 +0900 Subject: [PATCH 0564/1290] virtio: Add support for no-reset virtio PCI PM If a virtio_pci_device supports native PCI power management and has the No_Soft_Reset bit set, then skip resetting and reinitializing the device when suspending and restoring the device. This allows system-wide low power states like s2idle to be used in systems with stateful virtio devices that can't simply be re-initialized (e.g. virtio-fs). Signed-off-by: David Stevens Message-Id: <20231208070754.3132339-1-stevensd@chromium.org> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/virtio/virtio_pci_common.c | 34 +++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 7a5593997e0ef..1d21d1a1b3f55 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -492,8 +492,40 @@ static int virtio_pci_restore(struct device *dev) return virtio_device_restore(&vp_dev->vdev); } +static bool vp_supports_pm_no_reset(struct device *dev) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + u16 pmcsr; + + if (!pci_dev->pm_cap) + return false; + + pci_read_config_word(pci_dev, pci_dev->pm_cap + PCI_PM_CTRL, &pmcsr); + if (PCI_POSSIBLE_ERROR(pmcsr)) { + dev_err(dev, "Unable to query pmcsr"); + return false; + } + + return pmcsr & PCI_PM_CTRL_NO_SOFT_RESET; +} + +static int virtio_pci_suspend(struct device *dev) +{ + return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_freeze(dev); +} + +static int virtio_pci_resume(struct device *dev) +{ + return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_restore(dev); +} + static const struct dev_pm_ops virtio_pci_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(virtio_pci_freeze, virtio_pci_restore) + .suspend = virtio_pci_suspend, + .resume = virtio_pci_resume, + .freeze = virtio_pci_freeze, + .thaw = virtio_pci_restore, + .poweroff = virtio_pci_freeze, + .restore = virtio_pci_restore, }; #endif -- GitLab From c271fcd9095f066e0e7104e6d8919e2cf26f0899 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 10 Dec 2023 18:51:50 +0100 Subject: [PATCH 0565/1290] vdpa: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Signed-off-by: Christophe JAILLET Message-Id: Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index a7612e0783b36..d0695680b282e 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -131,7 +131,7 @@ static void vdpa_release_dev(struct device *d) if (ops->free) ops->free(vdev); - ida_simple_remove(&vdpa_index_ida, vdev->index); + ida_free(&vdpa_index_ida, vdev->index); kfree(vdev->driver_override); kfree(vdev); } @@ -205,7 +205,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, return vdev; err_name: - ida_simple_remove(&vdpa_index_ida, vdev->index); + ida_free(&vdpa_index_ida, vdev->index); err_ida: kfree(vdev); err: -- GitLab From 092e39d1456bda5c3d7dab0aa72a24e4b0b4f7a5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 19:25:25 +0900 Subject: [PATCH 0566/1290] kconfig: squash menu_has_help() and menu_get_help() menu_has_help() and menu_get_help() functions are only used within menu_get_ext_help(). Squash them into menu_get_ext_help(). It revealed the if-conditional in menu_get_help() was unneeded, as menu_has_help() has already checked that menu->help is not NULL. Signed-off-by: Masahiro Yamada --- scripts/kconfig/lkc.h | 2 -- scripts/kconfig/menu.c | 17 ++--------------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/scripts/kconfig/lkc.h b/scripts/kconfig/lkc.h index 471a59acecec6..5cdc8f5e6446a 100644 --- a/scripts/kconfig/lkc.h +++ b/scripts/kconfig/lkc.h @@ -99,8 +99,6 @@ bool menu_is_visible(struct menu *menu); bool menu_has_prompt(struct menu *menu); const char *menu_get_prompt(struct menu *menu); struct menu *menu_get_parent_menu(struct menu *menu); -bool menu_has_help(struct menu *menu); -const char *menu_get_help(struct menu *menu); int get_jump_key_char(void); struct gstr get_relations_str(struct symbol **sym_arr, struct list_head *head); void menu_get_ext_help(struct menu *menu, struct gstr *help); diff --git a/scripts/kconfig/menu.c b/scripts/kconfig/menu.c index 61c442d84aef4..2cce8b651f615 100644 --- a/scripts/kconfig/menu.c +++ b/scripts/kconfig/menu.c @@ -673,19 +673,6 @@ struct menu *menu_get_parent_menu(struct menu *menu) return menu; } -bool menu_has_help(struct menu *menu) -{ - return menu->help != NULL; -} - -const char *menu_get_help(struct menu *menu) -{ - if (menu->help) - return menu->help; - else - return ""; -} - static void get_def_str(struct gstr *r, struct menu *menu) { str_printf(r, "Defined at %s:%d\n", @@ -856,10 +843,10 @@ void menu_get_ext_help(struct menu *menu, struct gstr *help) struct symbol *sym = menu->sym; const char *help_text = nohelp_text; - if (menu_has_help(menu)) { + if (menu->help) { if (sym->name) str_printf(help, "%s%s:\n\n", CONFIG_, sym->name); - help_text = menu_get_help(menu); + help_text = menu->help; } str_printf(help, "%s\n", help_text); if (sym) -- GitLab From 405d2cb209b5836910b5dac01cf97fcbd186c0af Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 19:25:26 +0900 Subject: [PATCH 0567/1290] kconfig: add include guard to lkc_proto.h Signed-off-by: Masahiro Yamada --- scripts/kconfig/lkc_proto.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/kconfig/lkc_proto.h b/scripts/kconfig/lkc_proto.h index edd1e617b25c5..687d8698d801c 100644 --- a/scripts/kconfig/lkc_proto.h +++ b/scripts/kconfig/lkc_proto.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LKC_PROTO_H +#define LKC_PROTO_H + #include /* confdata.c */ @@ -50,3 +53,5 @@ char *expand_one_token(const char **str); /* expr.c */ void expr_print(struct expr *e, void (*fn)(void *, struct symbol *, const char *), void *data, int prevtoken); + +#endif /* LKC_PROTO_H */ -- GitLab From 9ad86d747c46f2bdc097c908481647fcdda1d035 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 19:25:27 +0900 Subject: [PATCH 0568/1290] kconfig: remove unreachable printf() Remove the unreachable code detected by clang. $ make HOSTCC=clang HOSTCFLAGS=-Wunreachable-code defconfig [ snip ] scripts/kconfig/expr.c:1134:2: warning: code will never be executed [-Wunreachable-code] printf("[%dgt%d?]", t1, t2); ^~~~~~ 1 warning generated. Signed-off-by: Masahiro Yamada --- scripts/kconfig/expr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/kconfig/expr.c b/scripts/kconfig/expr.c index 81ebf8108ca74..a290de36307ba 100644 --- a/scripts/kconfig/expr.c +++ b/scripts/kconfig/expr.c @@ -1131,7 +1131,6 @@ static int expr_compare_type(enum expr_type t1, enum expr_type t2) default: return -1; } - printf("[%dgt%d?]", t1, t2); return 0; } -- GitLab From 407868deb2a344e9baa7909e1b13aec35c7217b2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 3 Dec 2023 19:25:28 +0900 Subject: [PATCH 0569/1290] kconfig: remove redundant NULL pointer check before free() Passing NULL to free() is allowed and is a no-op. Remove redundant NULL pointer checks. Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 3 +-- scripts/kconfig/util.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index bd14aae1db582..f1197e6724316 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -432,8 +432,7 @@ load: case S_INT: case S_HEX: case S_STRING: - if (sym->def[def].val) - free(sym->def[def].val); + free(sym->def[def].val); /* fall through */ default: sym->def[def].val = NULL; diff --git a/scripts/kconfig/util.c b/scripts/kconfig/util.c index b78f114ad48cc..92e5b2b9761d7 100644 --- a/scripts/kconfig/util.c +++ b/scripts/kconfig/util.c @@ -42,8 +42,7 @@ struct gstr str_new(void) /* Free storage for growable string */ void str_free(struct gstr *gs) { - if (gs->s) - free(gs->s); + free(gs->s); gs->s = NULL; gs->len = 0; } -- GitLab From ac14947c77a36270d5cb1ff07afffbf221ac8af1 Mon Sep 17 00:00:00 2001 From: Markus Schneider-Pargmann Date: Tue, 5 Dec 2023 11:45:59 +0100 Subject: [PATCH 0570/1290] kconfig: Use KCONFIG_CONFIG instead of .config When using a custom location for kernel config files this merge config command fails as it doesn't use the configuration set with KCONFIG_CONFIG. Signed-off-by: Markus Schneider-Pargmann Signed-off-by: Masahiro Yamada --- scripts/kconfig/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index 7c025f82718e2..ea1bf3b3dbde1 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -107,7 +107,7 @@ config-fragments = $(call configfiles,$@) %.config: $(obj)/conf $(if $(config-fragments),, $(error $@ fragment does not exists on this architecture)) - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m .config $(config-fragments) + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m $(KCONFIG_CONFIG) $(config-fragments) $(Q)$(MAKE) -f $(srctree)/Makefile olddefconfig PHONY += tinyconfig -- GitLab From 5a602de99797bddc9dd7f73592281a507196f69d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= Date: Thu, 1 Jun 2023 09:53:33 +0200 Subject: [PATCH 0571/1290] Add .editorconfig file for basic formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit EditorConfig is a specification to define the most basic code formatting stuff, and it's supported by many editors and IDEs, either directly or via plugins, including VSCode/VSCodium, Vim, emacs and more. It allows to define formatting style related to indentation, charset, end of lines and trailing whitespaces. It also allows to apply different formats for different files based on wildcards, so for example it is possible to apply different configs to *.{c,h}, *.py and *.rs. In linux project, defining a .editorconfig might help to those people that work on different projects with different indentation styles, so they cannot define a global style. Now they will directly see the correct indentation on every fresh clone of the project. See https://editorconfig.org Co-developed-by: Danny Lin Signed-off-by: Danny Lin Signed-off-by: Íñigo Huguet Acked-by: Mickaël Salaün Reviewed-by: Vincent Mailhol Tested-by: Vincent Mailhol Signed-off-by: Masahiro Yamada --- .editorconfig | 32 ++++++++++++++++++++++++++ .gitignore | 1 + Documentation/process/4.Coding.rst | 4 ++++ Documentation/process/coding-style.rst | 4 ++++ 4 files changed, 41 insertions(+) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000000000..854773350cc5a --- /dev/null +++ b/.editorconfig @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: GPL-2.0-only + +root = true + +[{*.{awk,c,dts,dtsi,dtso,h,mk,s,S},Kconfig,Makefile,Makefile.*}] +charset = utf-8 +end_of_line = lf +trim_trailing_whitespace = true +insert_final_newline = true +indent_style = tab +indent_size = 8 + +[*.{json,py,rs}] +charset = utf-8 +end_of_line = lf +trim_trailing_whitespace = true +insert_final_newline = true +indent_style = space +indent_size = 4 + +# this must be below the general *.py to overwrite it +[tools/{perf,power,rcu,testing/kunit}/**.py,] +indent_style = tab +indent_size = 8 + +[*.yaml] +charset = utf-8 +end_of_line = lf +trim_trailing_whitespace = unset +insert_final_newline = true +indent_style = space +indent_size = 2 diff --git a/.gitignore b/.gitignore index 98274e1160d7b..689a4fa3f5477 100644 --- a/.gitignore +++ b/.gitignore @@ -96,6 +96,7 @@ modules.order # !.clang-format !.cocciconfig +!.editorconfig !.get_maintainer.ignore !.gitattributes !.gitignore diff --git a/Documentation/process/4.Coding.rst b/Documentation/process/4.Coding.rst index 1f0d81f44e14b..c2046dec0c2f4 100644 --- a/Documentation/process/4.Coding.rst +++ b/Documentation/process/4.Coding.rst @@ -66,6 +66,10 @@ for aligning variables/macros, for reflowing text and other similar tasks. See the file :ref:`Documentation/process/clang-format.rst ` for more details. +Some basic editor settings, such as indentation and line endings, will be +set automatically if you are using an editor that is compatible with +EditorConfig. See the official EditorConfig website for more information: +https://editorconfig.org/ Abstraction layers ****************** diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index 6db37a46d3059..c48382c6b4774 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -735,6 +735,10 @@ for aligning variables/macros, for reflowing text and other similar tasks. See the file :ref:`Documentation/process/clang-format.rst ` for more details. +Some basic editor settings, such as indentation and line endings, will be +set automatically if you are using an editor that is compatible with +EditorConfig. See the official EditorConfig website for more information: +https://editorconfig.org/ 10) Kconfig configuration files ------------------------------- -- GitLab From 21d706d5cf570917594b21edee81893bdce09ab8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 9 Jul 2021 08:41:17 +0100 Subject: [PATCH 0572/1290] netfs: Add support for DIO buffering Add a bvec array pointer and an iterator to netfs_io_request for either holding a copy of a DIO iterator or a list of all the bits of buffer pointed to by a DIO iterator. There are two problems: Firstly, if an iovec-class iov_iter is passed to ->read_iter() or ->write_iter(), this cannot be passed directly to kernel_sendmsg() or kernel_recvmsg() as that may cause locking recursion if a fault is generated, so we need to keep track of the pages involved separately. Secondly, if the I/O is asynchronous, we must copy the iov_iter describing the buffer before returning to the caller as it may be immediately deallocated. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/objects.c | 10 ++++++++++ include/linux/netfs.h | 3 +++ 2 files changed, 13 insertions(+) diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 1bd20bdad9836..4df5e5eeada68 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -76,6 +76,7 @@ static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); netfs_proc_del_rreq(rreq); @@ -84,6 +85,15 @@ static void netfs_free_request(struct work_struct *work) rreq->netfs_ops->free_request(rreq); if (rreq->cache_resources.ops) rreq->cache_resources.ops->end_operation(&rreq->cache_resources); + if (rreq->direct_bv) { + for (i = 0; i < rreq->direct_bv_count; i++) { + if (rreq->direct_bv[i].bv_page) { + if (rreq->direct_bv_unpin) + unpin_user_page(rreq->direct_bv[i].bv_page); + } + } + kvfree(rreq->direct_bv); + } kfree_rcu(rreq, rcu); netfs_stat_d(&netfs_n_rh_rreq); } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 3da962e977f55..2bb1273b38f42 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -190,6 +190,8 @@ struct netfs_io_request { struct iov_iter iter; /* Unencrypted-side iterator */ struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ void *netfs_priv; /* Private data for the netfs */ + struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ + unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; atomic_t nr_outstanding; /* Number of ops in progress */ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ @@ -197,6 +199,7 @@ struct netfs_io_request { size_t len; /* Length of the request */ short error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ + bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ loff_t i_size; /* Size of the file */ loff_t start; /* Start position */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ -- GitLab From 7d828a06634799aba0fa392913c7fe2953eb64a6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 22 Sep 2023 13:25:22 +0100 Subject: [PATCH 0573/1290] netfs: Provide tools to create a buffer in an xarray Provide tools to create a buffer in an xarray, with a function to add new folios with a mark. This will be used to create bounce buffer and can be used more easily to create a list of folios the span of which would require more than a page's worth of bio_vec structs. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/internal.h | 13 +++++++ fs/netfs/misc.c | 81 +++++++++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 4 +++ 3 files changed, 98 insertions(+) diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 4708fb15446bd..b908c7e0a9013 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -56,6 +56,19 @@ static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) {} static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} #endif +/* + * misc.c + */ +#define NETFS_FLAG_PUT_MARK BIT(0) +#define NETFS_FLAG_PAGECACHE_MARK BIT(1) +int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, + struct folio *folio, unsigned int flags, + gfp_t gfp_mask); +int netfs_add_folios_to_buffer(struct xarray *buffer, + struct address_space *mapping, + pgoff_t index, pgoff_t to, gfp_t gfp_mask); +void netfs_clear_buffer(struct xarray *buffer); + /* * objects.c */ diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 45bb19ec9a638..5d545073fe03c 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -8,6 +8,87 @@ #include #include "internal.h" +/* + * Attach a folio to the buffer and maybe set marks on it to say that we need + * to put the folio later and twiddle the pagecache flags. + */ +int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, + struct folio *folio, unsigned int flags, + gfp_t gfp_mask) +{ + XA_STATE_ORDER(xas, xa, index, folio_order(folio)); + +retry: + xas_lock(&xas); + for (;;) { + xas_store(&xas, folio); + if (!xas_error(&xas)) + break; + xas_unlock(&xas); + if (!xas_nomem(&xas, gfp_mask)) + return xas_error(&xas); + goto retry; + } + + if (flags & NETFS_FLAG_PUT_MARK) + xas_set_mark(&xas, NETFS_BUF_PUT_MARK); + if (flags & NETFS_FLAG_PAGECACHE_MARK) + xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK); + xas_unlock(&xas); + return xas_error(&xas); +} + +/* + * Create the specified range of folios in the buffer attached to the read + * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that + * these need freeing later. + */ +int netfs_add_folios_to_buffer(struct xarray *buffer, + struct address_space *mapping, + pgoff_t index, pgoff_t to, gfp_t gfp_mask) +{ + struct folio *folio; + int ret; + + if (to + 1 == index) /* Page range is inclusive */ + return 0; + + do { + /* TODO: Figure out what order folio can be allocated here */ + folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0); + if (!folio) + return -ENOMEM; + folio->index = index; + ret = netfs_xa_store_and_mark(buffer, index, folio, + NETFS_FLAG_PUT_MARK, gfp_mask); + if (ret < 0) { + folio_put(folio); + return ret; + } + + index += folio_nr_pages(folio); + } while (index <= to && index != 0); + + return 0; +} + +/* + * Clear an xarray buffer, putting a ref on the folios that have + * NETFS_BUF_PUT_MARK set. + */ +void netfs_clear_buffer(struct xarray *buffer) +{ + struct folio *folio; + XA_STATE(xas, buffer, 0); + + rcu_read_lock(); + xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) { + folio_put(folio); + } + rcu_read_unlock(); + xa_destroy(buffer); +} + /** * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback * @mapping: The mapping the folio belongs to. diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 2bb1273b38f42..c05365e3f4281 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -109,6 +109,10 @@ static inline int wait_on_page_fscache_killable(struct page *page) return folio_wait_private_2_killable(page_folio(page)); } +/* Marks used on xarray-based buffers */ +#define NETFS_BUF_PUT_MARK XA_MARK_0 /* - Page needs putting */ +#define NETFS_BUF_PAGECACHE_MARK XA_MARK_1 /* - Page needs wb/dirty flag wrangling */ + enum netfs_io_source { NETFS_FILL_WITH_ZEROES, NETFS_DOWNLOAD_FROM_SERVER, -- GitLab From cae932d3aee55035a54415dcea8e7ecf2ec469b5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 22 Sep 2023 14:49:47 +0100 Subject: [PATCH 0574/1290] netfs: Add func to calculate pagecount/size-limited span of an iterator Add a function to work out how much of an ITER_BVEC or ITER_XARRAY iterator we can use in a pagecount-limited and size-limited span. This will be used, for example, to limit the number of segments in a subrequest to the maximum number of elements that an RDMA transfer can handle. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/iterator.c | 97 +++++++++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 2 + 2 files changed, 99 insertions(+) diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c index 2ff07ba655a07..b781bbbf1d8d6 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -101,3 +101,100 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, return npages; } EXPORT_SYMBOL_GPL(netfs_extract_user_iter); + +/* + * Select the span of a bvec iterator we're going to use. Limit it by both maximum + * size and maximum number of segments. Returns the size of the span in bytes. + */ +static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + const struct bio_vec *bvecs = iter->bvec; + unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0; + size_t len, span = 0, n = iter->count; + size_t skip = iter->iov_offset + start_offset; + + if (WARN_ON(!iov_iter_is_bvec(iter)) || + WARN_ON(start_offset > n) || + n == 0) + return 0; + + while (n && ix < nbv && skip) { + len = bvecs[ix].bv_len; + if (skip < len) + break; + skip -= len; + n -= len; + ix++; + } + + while (n && ix < nbv) { + len = min3(n, bvecs[ix].bv_len - skip, max_size); + span += len; + nsegs++; + ix++; + if (span >= max_size || nsegs >= max_segs) + break; + skip = 0; + n -= len; + } + + return min(span, max_size); +} + +/* + * Select the span of an xarray iterator we're going to use. Limit it by both + * maximum size and maximum number of segments. It is assumed that segments + * can be larger than a page in size, provided they're physically contiguous. + * Returns the size of the span in bytes. + */ +static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + struct folio *folio; + unsigned int nsegs = 0; + loff_t pos = iter->xarray_start + iter->iov_offset; + pgoff_t index = pos / PAGE_SIZE; + size_t span = 0, n = iter->count; + + XA_STATE(xas, iter->xarray, index); + + if (WARN_ON(!iov_iter_is_xarray(iter)) || + WARN_ON(start_offset > n) || + n == 0) + return 0; + max_size = min(max_size, n - start_offset); + + rcu_read_lock(); + xas_for_each(&xas, folio, ULONG_MAX) { + size_t offset, flen, len; + if (xas_retry(&xas, folio)) + continue; + if (WARN_ON(xa_is_value(folio))) + break; + if (WARN_ON(folio_test_hugetlb(folio))) + break; + + flen = folio_size(folio); + offset = offset_in_folio(folio, pos); + len = min(max_size, flen - offset); + span += len; + nsegs++; + if (span >= max_size || nsegs >= max_segs) + break; + } + + rcu_read_unlock(); + return min(span, max_size); +} + +size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + if (iov_iter_is_bvec(iter)) + return netfs_limit_bvec(iter, start_offset, max_size, max_segs); + if (iov_iter_is_xarray(iter)) + return netfs_limit_xarray(iter, start_offset, max_size, max_segs); + BUG(); +} +EXPORT_SYMBOL(netfs_limit_iter); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c05365e3f4281..d673d0785b9d9 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -321,6 +321,8 @@ void netfs_put_subrequest(struct netfs_io_subrequest *subreq, ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, struct iov_iter *new, iov_iter_extraction_t extraction_flags); +size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs); int netfs_start_io_read(struct inode *inode); void netfs_end_io_read(struct inode *inode); -- GitLab From 768ddb1eacf5dd997ecf393e7bab9796bad047e0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 27 May 2022 13:45:28 +0100 Subject: [PATCH 0575/1290] netfs: Limit subrequest by size or number of segments Limit a subrequest to a maximum size and/or a maximum number of contiguous physical regions. This permits, for instance, an subreq's iterator to be limited to the number of DMA'able segments that a large RDMA request can handle. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/io.c | 18 ++++++++++++++++++ include/linux/netfs.h | 1 + include/trace/events/netfs.h | 1 + 3 files changed, 20 insertions(+) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index e9d408e211b83..e228bfb530ea5 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -525,6 +525,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, struct iov_iter *io_iter) { enum netfs_io_source source; + size_t lsize; _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); @@ -547,13 +548,30 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, source = NETFS_INVALID_READ; goto out; } + + if (subreq->max_nr_segs) { + lsize = netfs_limit_iter(io_iter, 0, subreq->len, + subreq->max_nr_segs); + if (subreq->len > lsize) { + subreq->len = lsize; + trace_netfs_sreq(subreq, netfs_sreq_trace_limited); + } + } } + if (subreq->len > rreq->len) + pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n", + rreq->debug_id, subreq->debug_index, + subreq->len, rreq->len); + if (WARN_ON(subreq->len == 0)) { source = NETFS_INVALID_READ; goto out; } + subreq->source = source; + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + subreq->io_iter = *io_iter; iov_iter_truncate(&subreq->io_iter, subreq->len); iov_iter_advance(io_iter, subreq->len); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index d673d0785b9d9..44cd13ad695af 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -161,6 +161,7 @@ struct netfs_io_subrequest { refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ + unsigned int max_nr_segs; /* 0 or max number of segments in an iterator */ enum netfs_io_source source; /* Where to read from/write to */ unsigned long flags; #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index beec534cbaab2..fce6d0bc78e5e 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -44,6 +44,7 @@ #define netfs_sreq_traces \ EM(netfs_sreq_trace_download_instead, "RDOWN") \ EM(netfs_sreq_trace_free, "FREE ") \ + EM(netfs_sreq_trace_limited, "LIMIT") \ EM(netfs_sreq_trace_prepare, "PREP ") \ EM(netfs_sreq_trace_resubmit_short, "SHORT") \ EM(netfs_sreq_trace_submit, "SUBMT") \ -- GitLab From 16af134ca4b7051b1587108f2066ec90ae029f74 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 9 Feb 2022 19:52:13 +0000 Subject: [PATCH 0576/1290] netfs: Extend the netfs_io_*request structs to handle writes Modify the netfs_io_request struct to act as a point around which writes can be coordinated. It represents and pins a range of pages that need writing and a list of regions of dirty data in that range of pages. If RMW is required, the original data can be downloaded into the bounce buffer, decrypted if necessary, the modifications made, then the modified data can be reencrypted/recompressed and sent back to the server. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/internal.h | 6 ++++++ fs/netfs/main.c | 3 ++- fs/netfs/objects.c | 6 ++++++ fs/netfs/stats.c | 16 +++++++++++++--- include/linux/netfs.h | 15 ++++++++++++++- include/trace/events/netfs.h | 8 ++++++-- 6 files changed, 47 insertions(+), 7 deletions(-) diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index b908c7e0a9013..2bf2e82b2ad79 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -110,6 +110,12 @@ extern atomic_t netfs_n_rh_write_begin; extern atomic_t netfs_n_rh_write_done; extern atomic_t netfs_n_rh_write_failed; extern atomic_t netfs_n_rh_write_zskip; +extern atomic_t netfs_n_wh_upload; +extern atomic_t netfs_n_wh_upload_done; +extern atomic_t netfs_n_wh_upload_failed; +extern atomic_t netfs_n_wh_write; +extern atomic_t netfs_n_wh_write_done; +extern atomic_t netfs_n_wh_write_failed; int netfs_stats_show(struct seq_file *m, void *v); diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 97ce1436615b7..ab6cac1106762 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -25,10 +25,11 @@ MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); LIST_HEAD(netfs_io_requests); DEFINE_SPINLOCK(netfs_proc_lock); -static const char *netfs_origins[] = { +static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_READAHEAD] = "RA", [NETFS_READPAGE] = "RP", [NETFS_READ_FOR_WRITE] = "RW", + [NETFS_WRITEBACK] = "WB", }; /* diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 4df5e5eeada68..65a17dd4ab492 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -20,6 +20,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct inode *inode = file ? file_inode(file) : mapping->host; struct netfs_inode *ctx = netfs_inode(inode); struct netfs_io_request *rreq; + bool cached = netfs_is_cache_enabled(ctx); int ret; rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request), @@ -37,7 +38,10 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, rreq->debug_id = atomic_inc_return(&debug_ids); INIT_LIST_HEAD(&rreq->subrequests); refcount_set(&rreq->ref, 1); + __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + if (cached) + __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); if (rreq->netfs_ops->init_request) { ret = rreq->netfs_ops->init_request(rreq, file); if (ret < 0) { @@ -46,6 +50,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } } + trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); return rreq; @@ -129,6 +134,7 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq sizeof(struct netfs_io_subrequest), GFP_KERNEL); if (subreq) { + INIT_WORK(&subreq->work, NULL); INIT_LIST_HEAD(&subreq->rreq_link); refcount_set(&subreq->ref, 2); subreq->rreq = rreq; diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 6025dc485f7ea..c1f85cd595a47 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -27,6 +27,12 @@ atomic_t netfs_n_rh_write_begin; atomic_t netfs_n_rh_write_done; atomic_t netfs_n_rh_write_failed; atomic_t netfs_n_rh_write_zskip; +atomic_t netfs_n_wh_upload; +atomic_t netfs_n_wh_upload_done; +atomic_t netfs_n_wh_upload_failed; +atomic_t netfs_n_wh_write; +atomic_t netfs_n_wh_write_done; +atomic_t netfs_n_wh_write_failed; int netfs_stats_show(struct seq_file *m, void *v) { @@ -50,10 +56,14 @@ int netfs_stats_show(struct seq_file *m, void *v) atomic_read(&netfs_n_rh_read), atomic_read(&netfs_n_rh_read_done), atomic_read(&netfs_n_rh_read_failed)); + seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n", + atomic_read(&netfs_n_wh_upload), + atomic_read(&netfs_n_wh_upload_done), + atomic_read(&netfs_n_wh_upload_failed)); seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n", - atomic_read(&netfs_n_rh_write), - atomic_read(&netfs_n_rh_write_done), - atomic_read(&netfs_n_rh_write_failed)); + atomic_read(&netfs_n_wh_write), + atomic_read(&netfs_n_wh_write_done), + atomic_read(&netfs_n_wh_write_failed)); return fscache_stats_show(m); } EXPORT_SYMBOL(netfs_stats_show); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 44cd13ad695af..f302123a3e384 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -118,6 +118,9 @@ enum netfs_io_source { NETFS_DOWNLOAD_FROM_SERVER, NETFS_READ_FROM_CACHE, NETFS_INVALID_READ, + NETFS_UPLOAD_TO_SERVER, + NETFS_WRITE_TO_CACHE, + NETFS_INVALID_WRITE, } __mode(byte); typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error, @@ -149,9 +152,14 @@ struct netfs_cache_resources { }; /* - * Descriptor for a single component subrequest. + * Descriptor for a single component subrequest. Each operation represents an + * individual read/write from/to a server, a cache, a journal, etc.. + * + * The buffer iterator is persistent for the life of the subrequest struct and + * the pages it points to can be relied on to exist for the duration. */ struct netfs_io_subrequest { + struct work_struct work; struct netfs_io_request *rreq; /* Supervising I/O request */ struct list_head rreq_link; /* Link in rreq->subrequests */ struct iov_iter io_iter; /* Iterator for this subrequest */ @@ -176,6 +184,8 @@ enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ + NETFS_WRITEBACK, /* This write was triggered by writepages */ + nr__netfs_io_origin } __mode(byte); /* @@ -198,6 +208,7 @@ struct netfs_io_request { struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; + unsigned int subreq_counter; /* Next subreq->debug_index */ atomic_t nr_outstanding; /* Number of ops in progress */ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ size_t submitted; /* Amount submitted for I/O so far */ @@ -216,6 +227,8 @@ struct netfs_io_request { #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ +#define NETFS_RREQ_WRITE_TO_CACHE 7 /* Need to write to the cache */ +#define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ const struct netfs_request_ops *netfs_ops; }; diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index fce6d0bc78e5e..4ea4e34d279f2 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -24,7 +24,8 @@ #define netfs_rreq_origins \ EM(NETFS_READAHEAD, "RA") \ EM(NETFS_READPAGE, "RP") \ - E_(NETFS_READ_FOR_WRITE, "RW") + EM(NETFS_READ_FOR_WRITE, "RW") \ + E_(NETFS_WRITEBACK, "WB") #define netfs_rreq_traces \ EM(netfs_rreq_trace_assess, "ASSESS ") \ @@ -39,7 +40,10 @@ EM(NETFS_FILL_WITH_ZEROES, "ZERO") \ EM(NETFS_DOWNLOAD_FROM_SERVER, "DOWN") \ EM(NETFS_READ_FROM_CACHE, "READ") \ - E_(NETFS_INVALID_READ, "INVL") \ + EM(NETFS_INVALID_READ, "INVL") \ + EM(NETFS_UPLOAD_TO_SERVER, "UPLD") \ + EM(NETFS_WRITE_TO_CACHE, "WRIT") \ + E_(NETFS_INVALID_WRITE, "INVL") #define netfs_sreq_traces \ EM(netfs_sreq_trace_download_instead, "RDOWN") \ -- GitLab From c6dc54dd91bbf597942b4975b8adec660a16827d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 25 Feb 2022 12:27:53 +0000 Subject: [PATCH 0577/1290] netfs: Add a hook to allow tell the netfs to update its i_size Add a hook for netfslib's write helpers to call to tell the network filesystem that it should update its i_size. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/netfs.h b/include/linux/netfs.h index f302123a3e384..3fc41f616621b 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -242,6 +242,7 @@ struct netfs_request_ops { void (*free_request)(struct netfs_io_request *rreq); void (*free_subrequest)(struct netfs_io_subrequest *rreq); + /* Read request handling */ void (*expand_readahead)(struct netfs_io_request *rreq); bool (*clamp_length)(struct netfs_io_subrequest *subreq); void (*issue_read)(struct netfs_io_subrequest *subreq); @@ -249,6 +250,9 @@ struct netfs_request_ops { int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, struct folio **foliop, void **_fsdata); void (*done)(struct netfs_io_request *rreq); + + /* Modification handling */ + void (*update_i_size)(struct inode *inode, loff_t i_size); }; /* -- GitLab From 6ba22d8d1521f35ca1343e64f69d7857f0340e5e Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 29 Sep 2023 14:35:17 +0100 Subject: [PATCH 0578/1290] netfs: Make netfs_put_request() handle a NULL pointer Make netfs_put_request() just return if given a NULL request pointer. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/objects.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 65a17dd4ab492..3aa0bfbc04ec3 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -106,19 +106,22 @@ static void netfs_free_request(struct work_struct *work) void netfs_put_request(struct netfs_io_request *rreq, bool was_async, enum netfs_rreq_ref_trace what) { - unsigned int debug_id = rreq->debug_id; + unsigned int debug_id; bool dead; int r; - dead = __refcount_dec_and_test(&rreq->ref, &r); - trace_netfs_rreq_ref(debug_id, r - 1, what); - if (dead) { - if (was_async) { - rreq->work.func = netfs_free_request; - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); - } else { - netfs_free_request(&rreq->work); + if (rreq) { + debug_id = rreq->debug_id; + dead = __refcount_dec_and_test(&rreq->ref, &r); + trace_netfs_rreq_ref(debug_id, r - 1, what); + if (dead) { + if (was_async) { + rreq->work.func = netfs_free_request; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); + } else { + netfs_free_request(&rreq->work); + } } } } -- GitLab From 4fcccc38ebbdcff74494701c50a8e2fe4689837e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Oct 2023 16:15:48 +0100 Subject: [PATCH 0579/1290] netfs: Make the refcounting of netfs_begin_read() easier to use Make the refcounting of netfs_begin_read() easier to use by not eating the caller's ref on the netfs_io_request it's given. This makes it easier to use when we need to look in the request struct after. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/buffered_read.c | 6 +++++- fs/netfs/io.c | 28 +++++++++++++--------------- include/trace/events/netfs.h | 9 +++++---- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 751556faa70b1..6b9a44cafbac6 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -210,6 +210,7 @@ void netfs_readahead(struct readahead_control *ractl) ; netfs_begin_read(rreq, false); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); return; cleanup_free: @@ -260,7 +261,9 @@ int netfs_read_folio(struct file *file, struct folio *folio) iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, rreq->start, rreq->len); - return netfs_begin_read(rreq, true); + ret = netfs_begin_read(rreq, true); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return ret; discard: netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); @@ -429,6 +432,7 @@ retry: ret = netfs_begin_read(rreq, true); if (ret < 0) goto error; + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); have_folio: ret = folio_wait_fscache_killable(folio); diff --git a/fs/netfs/io.c b/fs/netfs/io.c index e228bfb530ea5..e83ef5835d250 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -362,6 +362,7 @@ again: netfs_rreq_unlock_folios(rreq); + trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); @@ -657,7 +658,6 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) if (rreq->len == 0) { pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); - netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len); return -EIO; } @@ -665,12 +665,10 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) INIT_WORK(&rreq->work, netfs_rreq_work); - if (sync) - netfs_get_request(rreq, netfs_rreq_trace_get_hold); - /* Chop the read into slices according to what the cache and the netfs * want and submit each one. */ + netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding); atomic_set(&rreq->nr_outstanding, 1); io_iter = rreq->io_iter; do { @@ -680,25 +678,25 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) } while (rreq->submitted < rreq->len); if (sync) { - /* Keep nr_outstanding incremented so that the ref always belongs to - * us, and the service code isn't punted off to a random thread pool to - * process. + /* Keep nr_outstanding incremented so that the ref always + * belongs to us, and the service code isn't punted off to a + * random thread pool to process. Note that this might start + * further work, such as writing to the cache. */ - for (;;) { - wait_var_event(&rreq->nr_outstanding, - atomic_read(&rreq->nr_outstanding) == 1); + wait_var_event(&rreq->nr_outstanding, + atomic_read(&rreq->nr_outstanding) == 1); + if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_assess(rreq, false); - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) - break; - cond_resched(); - } + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); ret = rreq->error; if (ret == 0 && rreq->submitted < rreq->len) { trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); ret = -EIO; } - netfs_put_request(rreq, false, netfs_rreq_trace_put_hold); } else { /* If we decrement nr_outstanding to 0, the ref belongs to us. */ if (atomic_dec_and_test(&rreq->nr_outstanding)) diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 4ea4e34d279f2..6daadf2aac8aa 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -34,7 +34,9 @@ EM(netfs_rreq_trace_free, "FREE ") \ EM(netfs_rreq_trace_resubmit, "RESUBMT") \ EM(netfs_rreq_trace_unlock, "UNLOCK ") \ - E_(netfs_rreq_trace_unmark, "UNMARK ") + EM(netfs_rreq_trace_unmark, "UNMARK ") \ + EM(netfs_rreq_trace_wait_ip, "WAIT-IP") \ + E_(netfs_rreq_trace_wake_ip, "WAKE-IP") #define netfs_sreq_sources \ EM(NETFS_FILL_WITH_ZEROES, "ZERO") \ @@ -65,14 +67,13 @@ E_(netfs_fail_prepare_write, "prep-write") #define netfs_rreq_ref_traces \ - EM(netfs_rreq_trace_get_hold, "GET HOLD ") \ + EM(netfs_rreq_trace_get_for_outstanding,"GET OUTSTND") \ EM(netfs_rreq_trace_get_subreq, "GET SUBREQ ") \ EM(netfs_rreq_trace_put_complete, "PUT COMPLT ") \ EM(netfs_rreq_trace_put_discard, "PUT DISCARD") \ EM(netfs_rreq_trace_put_failed, "PUT FAILED ") \ - EM(netfs_rreq_trace_put_hold, "PUT HOLD ") \ + EM(netfs_rreq_trace_put_return, "PUT RETURN ") \ EM(netfs_rreq_trace_put_subreq, "PUT SUBREQ ") \ - EM(netfs_rreq_trace_put_zero_len, "PUT ZEROLEN") \ E_(netfs_rreq_trace_new, "NEW ") #define netfs_sreq_ref_traces \ -- GitLab From 9ebff83e648148b9ece97d4e4890dd84ca54d6ce Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 29 Sep 2023 17:28:25 +0100 Subject: [PATCH 0580/1290] netfs: Prep to use folio->private for write grouping and streaming write Prepare to use folio->private to hold information write grouping and streaming write. These are implemented in the same commit as they both make use of folio->private and will be both checked at the same time in several places. "Write grouping" involves ordering the writeback of groups of writes, such as is needed for ceph snaps. A group is represented by a filesystem-supplied object which must contain a netfs_group struct. This contains just a refcount and a pointer to a destructor. "Streaming write" is the storage of data in folios that are marked dirty, but not uptodate, to avoid unnecessary reads of data. This is represented by a netfs_folio struct. This contains the offset and length of the modified region plus the otherwise displaced write grouping pointer. The way folio->private is multiplexed is: (1) If private is NULL then neither is in operation on a dirty folio. (2) If private is set, with bit 0 clear, then this points to a group. (3) If private is set, with bit 0 set, then this points to a netfs_folio struct (with bit 0 AND'ed out). Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/internal.h | 28 ++++++++++++++++++++++++++ fs/netfs/misc.c | 46 +++++++++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 41 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+) diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 2bf2e82b2ad79..d72292e40f9b9 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -149,6 +149,34 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) #endif } +/* + * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap). + */ +static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group) +{ + if (netfs_group) + refcount_inc(&netfs_group->ref); + return netfs_group; +} + +/* + * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap). + */ +static inline void netfs_put_group(struct netfs_group *netfs_group) +{ + if (netfs_group && refcount_dec_and_test(&netfs_group->ref)) + netfs_group->free(netfs_group); +} + +/* + * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap). + */ +static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr) +{ + if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref)) + netfs_group->free(netfs_group); +} + /* * fscache-cache.c */ diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 5d545073fe03c..eeb44abe59c52 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -177,9 +177,55 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback); */ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { + struct netfs_folio *finfo = NULL; + size_t flen = folio_size(folio); + _enter("{%lx},%zx,%zx", folio_index(folio), offset, length); folio_wait_fscache(folio); + + if (!folio_test_private(folio)) + return; + + finfo = netfs_folio_info(folio); + + if (offset == 0 && length >= flen) + goto erase_completely; + + if (finfo) { + /* We have a partially uptodate page from a streaming write. */ + unsigned int fstart = finfo->dirty_offset; + unsigned int fend = fstart + finfo->dirty_len; + unsigned int end = offset + length; + + if (offset >= fend) + return; + if (end <= fstart) + return; + if (offset <= fstart && end >= fend) + goto erase_completely; + if (offset <= fstart && end > fstart) + goto reduce_len; + if (offset > fstart && end >= fend) + goto move_start; + /* A partial write was split. The caller has already zeroed + * it, so just absorb the hole. + */ + } + return; + +erase_completely: + netfs_put_group(netfs_folio_group(folio)); + folio_detach_private(folio); + folio_clear_uptodate(folio); + kfree(finfo); + return; +reduce_len: + finfo->dirty_len = offset + length - finfo->dirty_offset; + return; +move_start: + finfo->dirty_len -= offset - finfo->dirty_offset; + finfo->dirty_offset = offset; } EXPORT_SYMBOL(netfs_invalidate_folio); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 3fc41f616621b..cfba83e3e3d2a 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -140,6 +140,47 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ }; +/* + * A netfs group - for instance a ceph snap. This is marked on dirty pages and + * pages marked with a group must be flushed before they can be written under + * the domain of another group. + */ +struct netfs_group { + refcount_t ref; + void (*free)(struct netfs_group *netfs_group); +}; + +/* + * Information about a dirty page (attached only if necessary). + * folio->private + */ +struct netfs_folio { + struct netfs_group *netfs_group; /* Filesystem's grouping marker (or NULL). */ + unsigned int dirty_offset; /* Write-streaming dirty data offset */ + unsigned int dirty_len; /* Write-streaming dirty data length */ +}; +#define NETFS_FOLIO_INFO 0x1UL /* OR'd with folio->private. */ + +static inline struct netfs_folio *netfs_folio_info(struct folio *folio) +{ + void *priv = folio_get_private(folio); + + if ((unsigned long)priv & NETFS_FOLIO_INFO) + return (struct netfs_folio *)((unsigned long)priv & ~NETFS_FOLIO_INFO); + return NULL; +} + +static inline struct netfs_group *netfs_folio_group(struct folio *folio) +{ + struct netfs_folio *finfo; + void *priv = folio_get_private(folio); + + finfo = netfs_folio_info(folio); + if (finfo) + return finfo->netfs_group; + return priv; +} + /* * Resources required to do operations on a cache. */ -- GitLab From 0e0f2dfe880fb19e4b15a7ca468623eb0b4ba586 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 29 Jun 2021 22:31:48 +0100 Subject: [PATCH 0581/1290] netfs: Dispatch write requests to process a writeback slice Dispatch one or more write reqeusts to process a writeback slice, where a slice is tailored more to logical block divisions within the file (such as crypto blocks, an object layout or cache granules) than the protocol RPC maximum capacity. The dispatch doesn't happen until throttling allows, at which point the entire writeback slice is processed and queued. A slice may be written to multiple destinations (one or more servers and the local cache) and the writes to each destination might be split up along different lines. The writeback slice holds the required folios pinned. An iov_iter is provided in netfs_write_request that describes the buffer to be used. This may be part of the pagecache, may have auxiliary padding pages attached or may be a bounce buffer resulting from crypto or compression. Consequently, the filesystem must not twiddle the folio markings directly. The following API is available to the filesystem: (1) The ->create_write_requests() method is called to ask the filesystem to create the requests it needs. This is passed the writeback slice to be processed. (2) The filesystem should then call netfs_create_write_request() to create the requests it needs. (3) Once a request is initialised, netfs_queue_write_request() can be called to dispatch it asynchronously, if not completed immediately. (4) netfs_write_request_completed() should be called to note the completion of a request. (5) netfs_get_write_request() and netfs_put_write_request() are provided to refcount a request. These take constants from the netfs_wreq_trace enum for logging into ftrace. (6) The ->free_write_request is method is called to ask the filesystem to clean up a request. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/Makefile | 3 +- fs/netfs/internal.h | 6 + fs/netfs/output.c | 363 +++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 13 ++ include/trace/events/netfs.h | 50 ++++- 5 files changed, 432 insertions(+), 3 deletions(-) create mode 100644 fs/netfs/output.c diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index cf3fc847b8ace..c69c6775b8acf 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -7,7 +7,8 @@ netfs-y := \ locking.o \ main.o \ misc.o \ - objects.o + objects.o \ + output.o netfs-$(CONFIG_NETFS_STATS) += stats.o diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index d72292e40f9b9..0f20587f5a9b1 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -88,6 +88,12 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what); } +/* + * output.c + */ +int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, + enum netfs_write_trace what); + /* * stats.c */ diff --git a/fs/netfs/output.c b/fs/netfs/output.c new file mode 100644 index 0000000000000..2ad0fd8c32be2 --- /dev/null +++ b/fs/netfs/output.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem high-level write support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/** + * netfs_create_write_request - Create a write operation. + * @wreq: The write request this is storing from. + * @dest: The destination type + * @start: Start of the region this write will modify + * @len: Length of the modification + * @worker: The worker function to handle the write(s) + * + * Allocate a write operation, set it up and add it to the list on a write + * request. + */ +struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq, + enum netfs_io_source dest, + loff_t start, size_t len, + work_func_t worker) +{ + struct netfs_io_subrequest *subreq; + + subreq = netfs_alloc_subrequest(wreq); + if (subreq) { + INIT_WORK(&subreq->work, worker); + subreq->source = dest; + subreq->start = start; + subreq->len = len; + subreq->debug_index = wreq->subreq_counter++; + + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write); + break; + default: + BUG(); + } + + subreq->io_iter = wreq->io_iter; + iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start); + iov_iter_truncate(&subreq->io_iter, subreq->len); + + trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), + netfs_sreq_trace_new); + atomic_inc(&wreq->nr_outstanding); + list_add_tail(&subreq->rreq_link, &wreq->subrequests); + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + } + + return subreq; +} +EXPORT_SYMBOL(netfs_create_write_request); + +/* + * Process a completed write request once all the component operations have + * been completed. + */ +static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async) +{ + struct netfs_io_subrequest *subreq; + struct netfs_inode *ctx = netfs_inode(wreq->inode); + + _enter("R=%x[]", wreq->debug_id); + + trace_netfs_rreq(wreq, netfs_rreq_trace_write_done); + + list_for_each_entry(subreq, &wreq->subrequests, rreq_link) { + if (!subreq->error) + continue; + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + /* Depending on the type of failure, this may prevent + * writeback completion unless we're in disconnected + * mode. + */ + if (!wreq->error) + wreq->error = subreq->error; + break; + + case NETFS_WRITE_TO_CACHE: + /* Failure doesn't prevent writeback completion unless + * we're in disconnected mode. + */ + if (subreq->error != -ENOBUFS) + ctx->ops->invalidate_cache(wreq); + break; + + default: + WARN_ON_ONCE(1); + if (!wreq->error) + wreq->error = -EIO; + return; + } + } + + wreq->cleanup(wreq); + + _debug("finished"); + trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); + clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); + wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); + + netfs_clear_subrequests(wreq, was_async); + netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete); +} + +/* + * Deal with the completion of writing the data to the cache. + */ +void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_subrequest *subreq = _op; + struct netfs_io_request *wreq = subreq->rreq; + unsigned int u; + + _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); + + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload_done); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write_done); + break; + case NETFS_INVALID_WRITE: + break; + default: + BUG(); + } + + if (IS_ERR_VALUE(transferred_or_error)) { + subreq->error = transferred_or_error; + trace_netfs_failure(wreq, subreq, transferred_or_error, + netfs_fail_write); + goto failed; + } + + if (WARN(transferred_or_error > subreq->len - subreq->transferred, + "Subreq excess write: R%x[%x] %zd > %zu - %zu", + wreq->debug_id, subreq->debug_index, + transferred_or_error, subreq->len, subreq->transferred)) + transferred_or_error = subreq->len - subreq->transferred; + + subreq->error = 0; + subreq->transferred += transferred_or_error; + + if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) + pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n", + wreq->debug_id, subreq->debug_index, + iov_iter_count(&subreq->io_iter), subreq->len, + subreq->transferred, subreq->io_iter.iter_type); + + if (subreq->transferred < subreq->len) + goto incomplete; + + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); +out: + trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); + + /* If we decrement nr_outstanding to 0, the ref belongs to us. */ + u = atomic_dec_return(&wreq->nr_outstanding); + if (u == 0) + netfs_write_terminated(wreq, was_async); + else if (u == 1) + wake_up_var(&wreq->nr_outstanding); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); + return; + +incomplete: + if (transferred_or_error == 0) { + if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + subreq->error = -ENODATA; + goto failed; + } + } else { + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + } + + __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags); + goto out; + +failed: + switch (subreq->source) { + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write_failed); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags); + break; + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload_failed); + set_bit(NETFS_RREQ_FAILED, &wreq->flags); + wreq->error = subreq->error; + break; + default: + break; + } + goto out; +} +EXPORT_SYMBOL(netfs_write_subrequest_terminated); + +static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *wreq = subreq->rreq; + struct netfs_cache_resources *cres = &wreq->cache_resources; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + + cres->ops->write(cres, subreq->start, &subreq->io_iter, + netfs_write_subrequest_terminated, subreq); +} + +static void netfs_write_to_cache_op_worker(struct work_struct *work) +{ + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); + + netfs_write_to_cache_op(subreq); +} + +/** + * netfs_queue_write_request - Queue a write request for attention + * @subreq: The write request to be queued + * + * Queue the specified write request for processing by a worker thread. We + * pass the caller's ref on the request to the worker thread. + */ +void netfs_queue_write_request(struct netfs_io_subrequest *subreq) +{ + if (!queue_work(system_unbound_wq, &subreq->work)) + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip); +} +EXPORT_SYMBOL(netfs_queue_write_request); + +/* + * Set up a op for writing to the cache. + */ +static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq) +{ + struct netfs_cache_resources *cres; + struct netfs_io_subrequest *subreq; + struct netfs_inode *ctx = netfs_inode(wreq->inode); + struct fscache_cookie *cookie = netfs_i_cookie(ctx); + loff_t start = wreq->start; + size_t len = wreq->len; + int ret; + + if (!fscache_cookie_enabled(cookie)) { + clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags); + return; + } + + _debug("write to cache"); + subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len, + netfs_write_to_cache_op_worker); + if (!subreq) + return; + + cres = &wreq->cache_resources; + ret = fscache_begin_read_operation(cres, cookie); + if (ret < 0) { + netfs_write_subrequest_terminated(subreq, ret, false); + return; + } + + ret = cres->ops->prepare_write(cres, &start, &len, i_size_read(wreq->inode), + true); + if (ret < 0) { + netfs_write_subrequest_terminated(subreq, ret, false); + return; + } + + netfs_queue_write_request(subreq); +} + +/* + * Begin the process of writing out a chunk of data. + * + * We are given a write request that holds a series of dirty regions and + * (partially) covers a sequence of folios, all of which are present. The + * pages must have been marked as writeback as appropriate. + * + * We need to perform the following steps: + * + * (1) If encrypting, create an output buffer and encrypt each block of the + * data into it, otherwise the output buffer will point to the original + * folios. + * + * (2) If the data is to be cached, set up a write op for the entire output + * buffer to the cache, if the cache wants to accept it. + * + * (3) If the data is to be uploaded (ie. not merely cached): + * + * (a) If the data is to be compressed, create a compression buffer and + * compress the data into it. + * + * (b) For each destination we want to upload to, set up write ops to write + * to that destination. We may need multiple writes if the data is not + * contiguous or the span exceeds wsize for a server. + */ +int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, + enum netfs_write_trace what) +{ + struct netfs_inode *ctx = netfs_inode(wreq->inode); + + _enter("R=%x %llx-%llx f=%lx", + wreq->debug_id, wreq->start, wreq->start + wreq->len - 1, + wreq->flags); + + trace_netfs_write(wreq, what); + if (wreq->len == 0 || wreq->iter.count == 0) { + pr_err("Zero-sized write [R=%x]\n", wreq->debug_id); + return -EIO; + } + + wreq->io_iter = wreq->iter; + + /* ->outstanding > 0 carries a ref */ + netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding); + atomic_set(&wreq->nr_outstanding, 1); + + /* Start the encryption/compression going. We can do that in the + * background whilst we generate a list of write ops that we want to + * perform. + */ + // TODO: Encrypt or compress the region as appropriate + + /* We need to write all of the region to the cache */ + if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) + netfs_set_up_write_to_cache(wreq); + + /* However, we don't necessarily write all of the region to the server. + * Caching of reads is being managed this way also. + */ + if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) + ctx->ops->create_write_requests(wreq, wreq->start, wreq->len); + + if (atomic_dec_and_test(&wreq->nr_outstanding)) + netfs_write_terminated(wreq, false); + + if (!may_wait) + return -EIOCBQUEUED; + + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + return wreq->error; +} diff --git a/include/linux/netfs.h b/include/linux/netfs.h index cfba83e3e3d2a..890a5d8b22992 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -249,6 +249,7 @@ struct netfs_io_request { struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; + unsigned int wsize; /* Maximum write size (0 for none) */ unsigned int subreq_counter; /* Next subreq->debug_index */ atomic_t nr_outstanding; /* Number of ops in progress */ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ @@ -271,6 +272,7 @@ struct netfs_io_request { #define NETFS_RREQ_WRITE_TO_CACHE 7 /* Need to write to the cache */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ const struct netfs_request_ops *netfs_ops; + void (*cleanup)(struct netfs_io_request *req); }; /* @@ -294,6 +296,11 @@ struct netfs_request_ops { /* Modification handling */ void (*update_i_size)(struct inode *inode, loff_t i_size); + + /* Write request handling */ + void (*create_write_requests)(struct netfs_io_request *wreq, + loff_t start, size_t len); + void (*invalidate_cache)(struct netfs_io_request *wreq); }; /* @@ -382,6 +389,12 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, iov_iter_extraction_t extraction_flags); size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, size_t max_size, size_t max_segs); +struct netfs_io_subrequest *netfs_create_write_request( + struct netfs_io_request *wreq, enum netfs_io_source dest, + loff_t start, size_t len, work_func_t worker); +void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, + bool was_async); +void netfs_queue_write_request(struct netfs_io_subrequest *subreq); int netfs_start_io_read(struct inode *inode); void netfs_end_io_read(struct inode *inode); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 6daadf2aac8aa..e03635172760e 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -21,6 +21,11 @@ EM(netfs_read_trace_readpage, "READPAGE ") \ E_(netfs_read_trace_write_begin, "WRITEBEGN") +#define netfs_write_traces \ + EM(netfs_write_trace_dio_write, "DIO-WRITE") \ + EM(netfs_write_trace_unbuffered_write, "UNB-WRITE") \ + E_(netfs_write_trace_writeback, "WRITEBACK") + #define netfs_rreq_origins \ EM(NETFS_READAHEAD, "RA") \ EM(NETFS_READPAGE, "RP") \ @@ -32,11 +37,13 @@ EM(netfs_rreq_trace_copy, "COPY ") \ EM(netfs_rreq_trace_done, "DONE ") \ EM(netfs_rreq_trace_free, "FREE ") \ + EM(netfs_rreq_trace_redirty, "REDIRTY") \ EM(netfs_rreq_trace_resubmit, "RESUBMT") \ EM(netfs_rreq_trace_unlock, "UNLOCK ") \ EM(netfs_rreq_trace_unmark, "UNMARK ") \ EM(netfs_rreq_trace_wait_ip, "WAIT-IP") \ - E_(netfs_rreq_trace_wake_ip, "WAKE-IP") + EM(netfs_rreq_trace_wake_ip, "WAKE-IP") \ + E_(netfs_rreq_trace_write_done, "WR-DONE") #define netfs_sreq_sources \ EM(NETFS_FILL_WITH_ZEROES, "ZERO") \ @@ -64,7 +71,8 @@ EM(netfs_fail_copy_to_cache, "copy-to-cache") \ EM(netfs_fail_read, "read") \ EM(netfs_fail_short_read, "short-read") \ - E_(netfs_fail_prepare_write, "prep-write") + EM(netfs_fail_prepare_write, "prep-write") \ + E_(netfs_fail_write, "write") #define netfs_rreq_ref_traces \ EM(netfs_rreq_trace_get_for_outstanding,"GET OUTSTND") \ @@ -74,6 +82,8 @@ EM(netfs_rreq_trace_put_failed, "PUT FAILED ") \ EM(netfs_rreq_trace_put_return, "PUT RETURN ") \ EM(netfs_rreq_trace_put_subreq, "PUT SUBREQ ") \ + EM(netfs_rreq_trace_put_work, "PUT WORK ") \ + EM(netfs_rreq_trace_see_work, "SEE WORK ") \ E_(netfs_rreq_trace_new, "NEW ") #define netfs_sreq_ref_traces \ @@ -82,9 +92,12 @@ EM(netfs_sreq_trace_get_short_read, "GET SHORTRD") \ EM(netfs_sreq_trace_new, "NEW ") \ EM(netfs_sreq_trace_put_clear, "PUT CLEAR ") \ + EM(netfs_sreq_trace_put_discard, "PUT DISCARD") \ EM(netfs_sreq_trace_put_failed, "PUT FAILED ") \ EM(netfs_sreq_trace_put_merged, "PUT MERGED ") \ EM(netfs_sreq_trace_put_no_copy, "PUT NO COPY") \ + EM(netfs_sreq_trace_put_wip, "PUT WIP ") \ + EM(netfs_sreq_trace_put_work, "PUT WORK ") \ E_(netfs_sreq_trace_put_terminated, "PUT TERM ") #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY @@ -96,6 +109,7 @@ #define E_(a, b) a enum netfs_read_trace { netfs_read_traces } __mode(byte); +enum netfs_write_trace { netfs_write_traces } __mode(byte); enum netfs_rreq_trace { netfs_rreq_traces } __mode(byte); enum netfs_sreq_trace { netfs_sreq_traces } __mode(byte); enum netfs_failure { netfs_failures } __mode(byte); @@ -113,6 +127,7 @@ enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte); #define E_(a, b) TRACE_DEFINE_ENUM(a); netfs_read_traces; +netfs_write_traces; netfs_rreq_origins; netfs_rreq_traces; netfs_sreq_sources; @@ -320,6 +335,37 @@ TRACE_EVENT(netfs_sreq_ref, __entry->ref) ); +TRACE_EVENT(netfs_write, + TP_PROTO(const struct netfs_io_request *wreq, + enum netfs_write_trace what), + + TP_ARGS(wreq, what), + + TP_STRUCT__entry( + __field(unsigned int, wreq ) + __field(unsigned int, cookie ) + __field(enum netfs_write_trace, what ) + __field(unsigned long long, start ) + __field(size_t, len ) + ), + + TP_fast_assign( + struct netfs_inode *__ctx = netfs_inode(wreq->inode); + struct fscache_cookie *__cookie = netfs_i_cookie(__ctx); + __entry->wreq = wreq->debug_id; + __entry->cookie = __cookie ? __cookie->debug_id : 0; + __entry->what = what; + __entry->start = wreq->start; + __entry->len = wreq->len; + ), + + TP_printk("R=%08x %s c=%08x by=%llx-%llx", + __entry->wreq, + __print_symbolic(__entry->what, netfs_write_traces), + __entry->cookie, + __entry->start, __entry->start + __entry->len - 1) + ); + #undef EM #undef E_ #endif /* _TRACE_NETFS_H */ -- GitLab From c38f4e96e605f17990e871214e6ea1496bc4e65f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 17 Jun 2021 13:09:21 +0100 Subject: [PATCH 0582/1290] netfs: Provide func to copy data to pagecache for buffered write Provide a netfs write helper, netfs_perform_write() to buffer data to be written in the pagecache and mark the modified folios dirty. It will perform "streaming writes" for folios that aren't currently resident, if possible, storing data in partially modified folios that are marked dirty, but not uptodate. It will also tag pages as belonging to fs-specific write groups if so directed by the filesystem. This is derived from generic_perform_write(), but doesn't use ->write_begin() and ->write_end(), having that logic rolled in instead. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/Makefile | 1 + fs/netfs/buffered_read.c | 49 ++++++ fs/netfs/buffered_write.c | 330 +++++++++++++++++++++++++++++++++++ fs/netfs/internal.h | 2 + fs/netfs/io.c | 1 + include/linux/netfs.h | 5 + include/trace/events/netfs.h | 73 ++++++++ 7 files changed, 461 insertions(+) create mode 100644 fs/netfs/buffered_write.c diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index c69c6775b8acf..85d8333a1ed46 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -2,6 +2,7 @@ netfs-y := \ buffered_read.o \ + buffered_write.o \ io.o \ iterator.o \ locking.o \ diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 6b9a44cafbac6..73a6e4d61f9de 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -63,6 +63,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) break; } if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); folio_start_fscache(folio); folio_started = true; } @@ -454,3 +455,51 @@ error: return ret; } EXPORT_SYMBOL(netfs_write_begin); + +/* + * Preload the data into a page we're proposing to write into. + */ +int netfs_prefetch_for_write(struct file *file, struct folio *folio, + size_t offset, size_t len) +{ + struct netfs_io_request *rreq; + struct address_space *mapping = folio_file_mapping(folio); + struct netfs_inode *ctx = netfs_inode(mapping->host); + unsigned long long start = folio_pos(folio); + size_t flen = folio_size(folio); + int ret; + + _enter("%zx @%llx", flen, start); + + ret = -ENOMEM; + + rreq = netfs_alloc_request(mapping, file, start, flen, + NETFS_READ_FOR_WRITE); + if (IS_ERR(rreq)) { + ret = PTR_ERR(rreq); + goto error; + } + + rreq->no_unlock_folio = folio_index(folio); + __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto error_put; + + netfs_stat(&netfs_n_rh_write_begin); + trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write); + + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + + ret = netfs_begin_read(rreq, true); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return ret; + +error_put: + netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); +error: + _leave(" = %d", ret); + return ret; +} diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c new file mode 100644 index 0000000000000..6e7f06d9962db --- /dev/null +++ b/fs/netfs/buffered_write.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem high-level write support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Determined write method. Adjust netfs_folio_traces if this is changed. + */ +enum netfs_how_to_modify { + NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */ + NETFS_JUST_PREFETCH, /* We have to read the folio anyway */ + NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */ + NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */ + NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */ + NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */ + NETFS_FLUSH_CONTENT, /* Flush incompatible content. */ +}; + +static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) +{ + if (netfs_group && !folio_get_private(folio)) + folio_attach_private(folio, netfs_get_group(netfs_group)); +} + +/* + * Decide how we should modify a folio. We might be attempting to do + * write-streaming, in which case we don't want to a local RMW cycle if we can + * avoid it. If we're doing local caching or content crypto, we award that + * priority over avoiding RMW. If the file is open readably, then we also + * assume that we may want to read what we wrote. + */ +static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, + struct file *file, + struct folio *folio, + void *netfs_group, + size_t flen, + size_t offset, + size_t len, + bool maybe_trouble) +{ + struct netfs_folio *finfo = netfs_folio_info(folio); + loff_t pos = folio_file_pos(folio); + + _enter(""); + + if (netfs_folio_group(folio) != netfs_group) + return NETFS_FLUSH_CONTENT; + + if (folio_test_uptodate(folio)) + return NETFS_FOLIO_IS_UPTODATE; + + if (pos >= ctx->remote_i_size) + return NETFS_MODIFY_AND_CLEAR; + + if (!maybe_trouble && offset == 0 && len >= flen) + return NETFS_WHOLE_FOLIO_MODIFY; + + if (file->f_mode & FMODE_READ) + return NETFS_JUST_PREFETCH; + + if (netfs_is_cache_enabled(ctx)) + return NETFS_JUST_PREFETCH; + + if (!finfo) + return NETFS_STREAMING_WRITE; + + /* We can continue a streaming write only if it continues on from the + * previous. If it overlaps, we must flush lest we suffer a partial + * copy and disjoint dirty regions. + */ + if (offset == finfo->dirty_offset + finfo->dirty_len) + return NETFS_STREAMING_WRITE_CONT; + return NETFS_FLUSH_CONTENT; +} + +/* + * Grab a folio for writing and lock it. + */ +static struct folio *netfs_grab_folio_for_write(struct address_space *mapping, + loff_t pos, size_t part) +{ + pgoff_t index = pos / PAGE_SIZE; + + return __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); +} + +/** + * netfs_perform_write - Copy data into the pagecache. + * @iocb: The operation parameters + * @iter: The source buffer + * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * + * Copy data into pagecache pages attached to the inode specified by @iocb. + * The caller must hold appropriate inode locks. + * + * Dirty pages are tagged with a netfs_folio struct if they're not up to date + * to indicate the range modified. Dirty pages may also be tagged with a + * netfs-specific grouping such that data from an old group gets flushed before + * a new one is started. + */ +ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + struct netfs_inode *ctx = netfs_inode(inode); + struct netfs_folio *finfo; + struct folio *folio; + enum netfs_how_to_modify howto; + enum netfs_folio_trace trace; + unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC; + ssize_t written = 0, ret; + loff_t i_size, pos = iocb->ki_pos, from, to; + size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; + bool maybe_trouble = false; + + do { + size_t flen; + size_t offset; /* Offset into pagecache folio */ + size_t part; /* Bytes to write to folio */ + size_t copied; /* Bytes copied from user */ + + ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); + if (unlikely(ret < 0)) + break; + + offset = pos & (max_chunk - 1); + part = min(max_chunk - offset, iov_iter_count(iter)); + + /* Bring in the user pages that we will copy from _first_ lest + * we hit a nasty deadlock on copying from the same page as + * we're writing to, without it being marked uptodate. + * + * Not only is this an optimisation, but it is also required to + * check that the address is actually valid, when atomic + * usercopies are used below. + * + * We rely on the page being held onto long enough by the LRU + * that we can grab it below if this causes it to be read. + */ + ret = -EFAULT; + if (unlikely(fault_in_iov_iter_readable(iter, part) == part)) + break; + + ret = -ENOMEM; + folio = netfs_grab_folio_for_write(mapping, pos, part); + if (!folio) + break; + + flen = folio_size(folio); + offset = pos & (flen - 1); + part = min_t(size_t, flen - offset, part); + + if (signal_pending(current)) { + ret = written ? -EINTR : -ERESTARTSYS; + goto error_folio_unlock; + } + + /* See if we need to prefetch the area we're going to modify. + * We need to do this before we get a lock on the folio in case + * there's more than one writer competing for the same cache + * block. + */ + howto = netfs_how_to_modify(ctx, file, folio, netfs_group, + flen, offset, part, maybe_trouble); + _debug("howto %u", howto); + switch (howto) { + case NETFS_JUST_PREFETCH: + ret = netfs_prefetch_for_write(file, folio, offset, part); + if (ret < 0) { + _debug("prefetch = %zd", ret); + goto error_folio_unlock; + } + break; + case NETFS_FOLIO_IS_UPTODATE: + case NETFS_WHOLE_FOLIO_MODIFY: + case NETFS_STREAMING_WRITE_CONT: + break; + case NETFS_MODIFY_AND_CLEAR: + zero_user_segment(&folio->page, 0, offset); + break; + case NETFS_STREAMING_WRITE: + ret = -EIO; + if (WARN_ON(folio_get_private(folio))) + goto error_folio_unlock; + break; + case NETFS_FLUSH_CONTENT: + trace_netfs_folio(folio, netfs_flush_content); + from = folio_pos(folio); + to = from + folio_size(folio) - 1; + folio_unlock(folio); + folio_put(folio); + ret = filemap_write_and_wait_range(mapping, from, to); + if (ret < 0) + goto error_folio_unlock; + continue; + } + + if (mapping_writably_mapped(mapping)) + flush_dcache_folio(folio); + + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + + flush_dcache_folio(folio); + + /* Deal with a (partially) failed copy */ + if (copied == 0) { + ret = -EFAULT; + goto error_folio_unlock; + } + + trace = (enum netfs_folio_trace)howto; + switch (howto) { + case NETFS_FOLIO_IS_UPTODATE: + case NETFS_JUST_PREFETCH: + netfs_set_group(folio, netfs_group); + break; + case NETFS_MODIFY_AND_CLEAR: + zero_user_segment(&folio->page, offset + copied, flen); + netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + break; + case NETFS_WHOLE_FOLIO_MODIFY: + if (unlikely(copied < part)) { + maybe_trouble = true; + iov_iter_revert(iter, copied); + copied = 0; + goto retry; + } + netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + break; + case NETFS_STREAMING_WRITE: + if (offset == 0 && copied == flen) { + netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + trace = netfs_streaming_filled_page; + break; + } + finfo = kzalloc(sizeof(*finfo), GFP_KERNEL); + if (!finfo) { + iov_iter_revert(iter, copied); + ret = -ENOMEM; + goto error_folio_unlock; + } + finfo->netfs_group = netfs_get_group(netfs_group); + finfo->dirty_offset = offset; + finfo->dirty_len = copied; + folio_attach_private(folio, (void *)((unsigned long)finfo | + NETFS_FOLIO_INFO)); + break; + case NETFS_STREAMING_WRITE_CONT: + finfo = netfs_folio_info(folio); + finfo->dirty_len += copied; + if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) { + if (finfo->netfs_group) + folio_change_private(folio, finfo->netfs_group); + else + folio_detach_private(folio); + folio_mark_uptodate(folio); + kfree(finfo); + trace = netfs_streaming_cont_filled_page; + } + break; + default: + WARN(true, "Unexpected modify type %u ix=%lx\n", + howto, folio_index(folio)); + ret = -EIO; + goto error_folio_unlock; + } + + trace_netfs_folio(folio, trace); + + /* Update the inode size if we moved the EOF marker */ + i_size = i_size_read(inode); + pos += copied; + if (pos > i_size) { + if (ctx->ops->update_i_size) { + ctx->ops->update_i_size(inode, pos); + } else { + i_size_write(inode, pos); +#if IS_ENABLED(CONFIG_FSCACHE) + fscache_update_cookie(ctx->cache, NULL, &pos); +#endif + } + } + written += copied; + + folio_mark_dirty(folio); + retry: + folio_unlock(folio); + folio_put(folio); + folio = NULL; + + cond_resched(); + } while (iov_iter_count(iter)); + +out: + if (likely(written)) { + /* Flush and wait for a write that requires immediate synchronisation. */ + if (iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) { + _debug("dsync"); + ret = filemap_fdatawait_range(mapping, iocb->ki_pos, + iocb->ki_pos + written); + } + + iocb->ki_pos += written; + } + + _leave(" = %zd [%zd]", written, ret); + return written ? written : ret; + +error_folio_unlock: + folio_unlock(folio); + folio_put(folio); + goto out; +} +EXPORT_SYMBOL(netfs_perform_write); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 0f20587f5a9b1..17e4ea4456c75 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -23,6 +23,8 @@ * buffered_read.c */ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq); +int netfs_prefetch_for_write(struct file *file, struct folio *folio, + size_t offset, size_t len); /* * io.c diff --git a/fs/netfs/io.c b/fs/netfs/io.c index e83ef5835d250..774aef6ea4cbc 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -125,6 +125,7 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, if (have_unlocked && folio_index(folio) <= unlocked) continue; unlocked = folio_index(folio); + trace_netfs_folio(folio, netfs_folio_trace_end_copy); folio_end_fscache(folio); have_unlocked = true; } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 890a5d8b22992..70f578cf3715f 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -367,6 +367,11 @@ struct netfs_cache_ops { loff_t *_data_start, size_t *_data_len); }; +/* High-level write API */ +ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group); + +/* Address operations API */ struct readahead_control; void netfs_readahead(struct readahead_control *); int netfs_read_folio(struct file *, struct folio *); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index e03635172760e..8308b81f36bed 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -19,6 +19,7 @@ EM(netfs_read_trace_expanded, "EXPANDED ") \ EM(netfs_read_trace_readahead, "READAHEAD") \ EM(netfs_read_trace_readpage, "READPAGE ") \ + EM(netfs_read_trace_prefetch_for_write, "PREFETCHW") \ E_(netfs_read_trace_write_begin, "WRITEBEGN") #define netfs_write_traces \ @@ -100,6 +101,31 @@ EM(netfs_sreq_trace_put_work, "PUT WORK ") \ E_(netfs_sreq_trace_put_terminated, "PUT TERM ") +#define netfs_folio_traces \ + /* The first few correspond to enum netfs_how_to_modify */ \ + EM(netfs_folio_is_uptodate, "mod-uptodate") \ + EM(netfs_just_prefetch, "mod-prefetch") \ + EM(netfs_whole_folio_modify, "mod-whole-f") \ + EM(netfs_modify_and_clear, "mod-n-clear") \ + EM(netfs_streaming_write, "mod-streamw") \ + EM(netfs_streaming_write_cont, "mod-streamw+") \ + EM(netfs_flush_content, "flush") \ + EM(netfs_streaming_filled_page, "mod-streamw-f") \ + EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \ + /* The rest are for writeback */ \ + EM(netfs_folio_trace_clear, "clear") \ + EM(netfs_folio_trace_clear_s, "clear-s") \ + EM(netfs_folio_trace_clear_g, "clear-g") \ + EM(netfs_folio_trace_copy_to_cache, "copy") \ + EM(netfs_folio_trace_end_copy, "end-copy") \ + EM(netfs_folio_trace_kill, "kill") \ + EM(netfs_folio_trace_mkwrite, "mkwrite") \ + EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \ + EM(netfs_folio_trace_redirty, "redirty") \ + EM(netfs_folio_trace_redirtied, "redirtied") \ + EM(netfs_folio_trace_store, "store") \ + E_(netfs_folio_trace_store_plus, "store+") + #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY @@ -115,6 +141,7 @@ enum netfs_sreq_trace { netfs_sreq_traces } __mode(byte); enum netfs_failure { netfs_failures } __mode(byte); enum netfs_rreq_ref_trace { netfs_rreq_ref_traces } __mode(byte); enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte); +enum netfs_folio_trace { netfs_folio_traces } __mode(byte); #endif @@ -135,6 +162,7 @@ netfs_sreq_traces; netfs_failures; netfs_rreq_ref_traces; netfs_sreq_ref_traces; +netfs_folio_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -335,6 +363,51 @@ TRACE_EVENT(netfs_sreq_ref, __entry->ref) ); +TRACE_EVENT(netfs_folio, + TP_PROTO(struct folio *folio, enum netfs_folio_trace why), + + TP_ARGS(folio, why), + + TP_STRUCT__entry( + __field(ino_t, ino) + __field(pgoff_t, index) + __field(unsigned int, nr) + __field(enum netfs_folio_trace, why) + ), + + TP_fast_assign( + __entry->ino = folio->mapping->host->i_ino; + __entry->why = why; + __entry->index = folio_index(folio); + __entry->nr = folio_nr_pages(folio); + ), + + TP_printk("i=%05lx ix=%05lx-%05lx %s", + __entry->ino, __entry->index, __entry->index + __entry->nr - 1, + __print_symbolic(__entry->why, netfs_folio_traces)) + ); + +TRACE_EVENT(netfs_write_iter, + TP_PROTO(const struct kiocb *iocb, const struct iov_iter *from), + + TP_ARGS(iocb, from), + + TP_STRUCT__entry( + __field(unsigned long long, start ) + __field(size_t, len ) + __field(unsigned int, flags ) + ), + + TP_fast_assign( + __entry->start = iocb->ki_pos; + __entry->len = iov_iter_count(from); + __entry->flags = iocb->ki_flags; + ), + + TP_printk("WRITE-ITER s=%llx l=%zx f=%x", + __entry->start, __entry->len, __entry->flags) + ); + TRACE_EVENT(netfs_write, TP_PROTO(const struct netfs_io_request *wreq, enum netfs_write_trace what), -- GitLab From 7f84a7b9892d1c9429a6f5d6f67916c61b3fc183 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 2 Oct 2023 12:51:19 +0100 Subject: [PATCH 0583/1290] netfs: Make netfs_read_folio() handle streaming-write pages netfs_read_folio() needs to handle partially-valid pages that are marked dirty, but not uptodate in the event that someone tries to read a page was used to cache data by a streaming write. In such a case, make netfs_read_folio() set up a bvec iterator that points to the parts of the folio that need filling and to a sink page for the data that should be discarded and use that instead of i_pages as the iterator to be written to. This requires netfs_rreq_unlock_folios() to convert the page into a normal dirty uptodate page, getting rid of the partial write record and bumping the group pointer over to folio->private. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/buffered_read.c | 61 ++++++++++++++++++++++++++++++++++-- include/trace/events/netfs.h | 2 ++ 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 73a6e4d61f9de..950f63fc156a8 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -16,6 +16,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; + struct netfs_folio *finfo; struct folio *folio; pgoff_t start_page = rreq->start / PAGE_SIZE; pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; @@ -87,6 +88,15 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) if (!pg_failed) { flush_dcache_folio(folio); + finfo = netfs_folio_info(folio); + if (finfo) { + trace_netfs_folio(folio, netfs_folio_trace_filled_gaps); + if (finfo->netfs_group) + folio_change_private(folio, finfo->netfs_group); + else + folio_detach_private(folio); + kfree(finfo); + } folio_mark_uptodate(folio); } @@ -239,6 +249,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) struct address_space *mapping = folio_file_mapping(folio); struct netfs_io_request *rreq; struct netfs_inode *ctx = netfs_inode(mapping->host); + struct folio *sink = NULL; int ret; _enter("%lx", folio_index(folio)); @@ -259,12 +270,56 @@ int netfs_read_folio(struct file *file, struct folio *folio) trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); /* Set up the output buffer */ - iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, - rreq->start, rreq->len); + if (folio_test_dirty(folio)) { + /* Handle someone trying to read from an unflushed streaming + * write. We fiddle the buffer so that a gap at the beginning + * and/or a gap at the end get copied to, but the middle is + * discarded. + */ + struct netfs_folio *finfo = netfs_folio_info(folio); + struct bio_vec *bvec; + unsigned int from = finfo->dirty_offset; + unsigned int to = from + finfo->dirty_len; + unsigned int off = 0, i = 0; + size_t flen = folio_size(folio); + size_t nr_bvec = flen / PAGE_SIZE + 2; + size_t part; + + ret = -ENOMEM; + bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL); + if (!bvec) + goto discard; + + sink = folio_alloc(GFP_KERNEL, 0); + if (!sink) + goto discard; + + trace_netfs_folio(folio, netfs_folio_trace_read_gaps); + + rreq->direct_bv = bvec; + rreq->direct_bv_count = nr_bvec; + if (from > 0) { + bvec_set_folio(&bvec[i++], folio, from, 0); + off = from; + } + while (off < to) { + part = min_t(size_t, to - off, PAGE_SIZE); + bvec_set_folio(&bvec[i++], sink, part, 0); + off += part; + } + if (to < flen) + bvec_set_folio(&bvec[i++], folio, flen - to, to); + iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); + } else { + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + } ret = netfs_begin_read(rreq, true); + if (sink) + folio_put(sink); netfs_put_request(rreq, false, netfs_rreq_trace_put_return); - return ret; + return ret < 0 ? ret : 0; discard: netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 8308b81f36bed..082a5e717b58f 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -118,9 +118,11 @@ EM(netfs_folio_trace_clear_g, "clear-g") \ EM(netfs_folio_trace_copy_to_cache, "copy") \ EM(netfs_folio_trace_end_copy, "end-copy") \ + EM(netfs_folio_trace_filled_gaps, "filled-gaps") \ EM(netfs_folio_trace_kill, "kill") \ EM(netfs_folio_trace_mkwrite, "mkwrite") \ EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \ + EM(netfs_folio_trace_read_gaps, "read-gaps") \ EM(netfs_folio_trace_redirty, "redirty") \ EM(netfs_folio_trace_redirtied, "redirtied") \ EM(netfs_folio_trace_store, "store") \ -- GitLab From e2e2e83924b1fe4c28bf5617db90e893755e9cbd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 29 Sep 2023 20:11:31 +0100 Subject: [PATCH 0584/1290] netfs: Allocate multipage folios in the writepath Allocate a multipage folio when copying data into the pagecache if possible if there's sufficient data to warrant it. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/buffered_write.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 6e7f06d9962db..b76688e98f819 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -84,14 +84,19 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, } /* - * Grab a folio for writing and lock it. + * Grab a folio for writing and lock it. Attempt to allocate as large a folio + * as possible to hold as much of the remaining length as possible in one go. */ static struct folio *netfs_grab_folio_for_write(struct address_space *mapping, loff_t pos, size_t part) { pgoff_t index = pos / PAGE_SIZE; + fgf_t fgp_flags = FGP_WRITEBEGIN; - return __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + if (mapping_large_folio_support(mapping)) + fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part); + + return __filemap_get_folio(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); } -- GitLab From 016dc8516aec8719641e7aaaacd78d344759178e Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2022 17:39:55 +0000 Subject: [PATCH 0585/1290] netfs: Implement unbuffered/DIO read support Implement support for unbuffered and DIO reads in the netfs library, utilising the existing read helper code to do block splitting and individual queuing. The code also handles extraction of the destination buffer from the supplied iterator, allowing async unbuffered reads to take place. The read will be split up according to the rsize setting and, if supplied, the ->clamp_length() method. Note that the next subrequest will be issued as soon as issue_op returns, without waiting for previous ones to finish. The network filesystem needs to pause or handle queuing them if it doesn't want to fire them all at the server simultaneously. Once all the subrequests have finished, the state will be assessed and the amount of data to be indicated as having being obtained will be determined. As the subrequests may finish in any order, if an intermediate subrequest is short, any further subrequests may be copied into the buffer and then abandoned. In the future, this will also take care of doing an unbuffered read from encrypted content, with the decryption being done by the library. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/Makefile | 1 + fs/netfs/direct_read.c | 125 +++++++++++++++++++++++++++++++++++ fs/netfs/internal.h | 1 + fs/netfs/io.c | 83 ++++++++++++++++++++--- fs/netfs/main.c | 1 + fs/netfs/objects.c | 5 +- fs/netfs/stats.c | 4 +- include/linux/netfs.h | 9 +++ include/trace/events/netfs.h | 7 +- mm/filemap.c | 1 + 10 files changed, 226 insertions(+), 11 deletions(-) create mode 100644 fs/netfs/direct_read.c diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index 85d8333a1ed46..e968ab1eca400 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -3,6 +3,7 @@ netfs-y := \ buffered_read.o \ buffered_write.o \ + direct_read.o \ io.o \ iterator.o \ locking.o \ diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c new file mode 100644 index 0000000000000..ad4370b3935d6 --- /dev/null +++ b/fs/netfs/direct_read.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Direct I/O support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/** + * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read + * @iocb: The I/O control descriptor describing the read + * @iter: The output buffer (also specifies read length) + * + * Perform an unbuffered I/O or direct I/O from the file in @iocb to the + * output buffer. No use is made of the pagecache. + * + * The caller must hold any appropriate locks. + */ +static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter) +{ + struct netfs_io_request *rreq; + ssize_t ret; + size_t orig_count = iov_iter_count(iter); + bool async = !is_sync_kiocb(iocb); + + _enter(""); + + if (!orig_count) + return 0; /* Don't update atime */ + + ret = kiocb_write_and_wait(iocb, orig_count); + if (ret < 0) + return ret; + file_accessed(iocb->ki_filp); + + rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, + iocb->ki_pos, orig_count, + NETFS_DIO_READ); + if (IS_ERR(rreq)) + return PTR_ERR(rreq); + + netfs_stat(&netfs_n_rh_dio_read); + trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read); + + /* If this is an async op, we have to keep track of the destination + * buffer for ourselves as the caller's iterator will be trashed when + * we return. + * + * In such a case, extract an iterator to represent as much of the the + * output buffer as we can manage. Note that the extraction might not + * be able to allocate a sufficiently large bvec array and may shorten + * the request. + */ + if (user_backed_iter(iter)) { + ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0); + if (ret < 0) + goto out; + rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec; + rreq->direct_bv_count = ret; + rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); + rreq->len = iov_iter_count(&rreq->iter); + } else { + rreq->iter = *iter; + rreq->len = orig_count; + rreq->direct_bv_unpin = false; + iov_iter_advance(iter, orig_count); + } + + // TODO: Set up bounce buffer if needed + + if (async) + rreq->iocb = iocb; + + ret = netfs_begin_read(rreq, is_sync_kiocb(iocb)); + if (ret < 0) + goto out; /* May be -EIOCBQUEUED */ + if (!async) { + // TODO: Copy from bounce buffer + iocb->ki_pos += rreq->transferred; + ret = rreq->transferred; + } + +out: + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + if (ret > 0) + orig_count -= ret; + if (ret != -EIOCBQUEUED) + iov_iter_revert(iter, orig_count - iov_iter_count(iter)); + return ret; +} + +/** + * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read + * @iocb: The I/O control descriptor describing the read + * @iter: The output buffer (also specifies read length) + * + * Perform an unbuffered I/O or direct I/O from the file in @iocb to the + * output buffer. No use is made of the pagecache. + */ +ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (!iter->count) + return 0; /* Don't update atime */ + + ret = netfs_start_io_direct(inode); + if (ret == 0) { + ret = netfs_unbuffered_read_iter_locked(iocb, iter); + netfs_end_io_direct(inode); + } + return ret; +} +EXPORT_SYMBOL(netfs_unbuffered_read_iter); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 17e4ea4456c75..886c2e8f841fb 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -100,6 +100,7 @@ int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, * stats.c */ #ifdef CONFIG_NETFS_STATS +extern atomic_t netfs_n_rh_dio_read; extern atomic_t netfs_n_rh_readahead; extern atomic_t netfs_n_rh_readpage; extern atomic_t netfs_n_rh_rreq; diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 774aef6ea4cbc..c972415c8aad3 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -78,7 +78,9 @@ static void netfs_read_from_server(struct netfs_io_request *rreq, struct netfs_io_subrequest *subreq) { netfs_stat(&netfs_n_rh_download); - if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) + + if (rreq->origin != NETFS_DIO_READ && + iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n", rreq->debug_id, subreq->debug_index, iov_iter_count(&subreq->io_iter), subreq->len, @@ -341,6 +343,43 @@ static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq) } } +/* + * Determine how much we can admit to having read from a DIO read. + */ +static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + unsigned int i; + size_t transferred = 0; + + for (i = 0; i < rreq->direct_bv_count; i++) + flush_dcache_page(rreq->direct_bv[i].bv_page); + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + if (subreq->error || subreq->transferred == 0) + break; + transferred += subreq->transferred; + if (subreq->transferred < subreq->len) + break; + } + + for (i = 0; i < rreq->direct_bv_count; i++) + flush_dcache_page(rreq->direct_bv[i].bv_page); + + rreq->transferred = transferred; + task_io_account_read(transferred); + + if (rreq->iocb) { + rreq->iocb->ki_pos += transferred; + if (rreq->iocb->ki_complete) + rreq->iocb->ki_complete( + rreq->iocb, rreq->error ? rreq->error : transferred); + } + if (rreq->netfs_ops->done) + rreq->netfs_ops->done(rreq); + inode_dio_end(rreq->inode); +} + /* * Assess the state of a read request and decide what to do next. * @@ -361,7 +400,10 @@ again: return; } - netfs_rreq_unlock_folios(rreq); + if (rreq->origin != NETFS_DIO_READ) + netfs_rreq_unlock_folios(rreq); + else + netfs_rreq_assess_dio(rreq); trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); @@ -526,14 +568,16 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, struct netfs_io_subrequest *subreq, struct iov_iter *io_iter) { - enum netfs_io_source source; + enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; size_t lsize; _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); - source = netfs_cache_prepare_read(subreq, rreq->i_size); - if (source == NETFS_INVALID_READ) - goto out; + if (rreq->origin != NETFS_DIO_READ) { + source = netfs_cache_prepare_read(subreq, rreq->i_size); + if (source == NETFS_INVALID_READ) + goto out; + } if (source == NETFS_DOWNLOAD_FROM_SERVER) { /* Call out to the netfs to let it shrink the request to fit @@ -544,6 +588,8 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, */ if (subreq->len > rreq->i_size - subreq->start) subreq->len = rreq->i_size - subreq->start; + if (rreq->rsize && subreq->len > rreq->rsize) + subreq->len = rreq->rsize; if (rreq->netfs_ops->clamp_length && !rreq->netfs_ops->clamp_length(subreq)) { @@ -662,6 +708,10 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) return -EIO; } + if (rreq->origin == NETFS_DIO_READ) + inode_dio_begin(rreq->inode); + + // TODO: Use bounce buffer if requested rreq->io_iter = rreq->iter; INIT_WORK(&rreq->work, netfs_rreq_work); @@ -673,11 +723,25 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) atomic_set(&rreq->nr_outstanding, 1); io_iter = rreq->io_iter; do { + _debug("submit %llx + %zx >= %llx", + rreq->start, rreq->submitted, rreq->i_size); + if (rreq->origin == NETFS_DIO_READ && + rreq->start + rreq->submitted >= rreq->i_size) + break; if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index)) break; + if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && + test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) + break; } while (rreq->submitted < rreq->len); + if (!rreq->submitted) { + netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); + ret = 0; + goto out; + } + if (sync) { /* Keep nr_outstanding incremented so that the ref always * belongs to us, and the service code isn't punted off to a @@ -694,7 +758,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) TASK_UNINTERRUPTIBLE); ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len) { + if (ret == 0 && rreq->submitted < rreq->len && + rreq->origin != NETFS_DIO_READ) { trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); ret = -EIO; } @@ -702,7 +767,9 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) /* If we decrement nr_outstanding to 0, the ref belongs to us. */ if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_assess(rreq, false); - ret = 0; + ret = -EIOCBQUEUED; } + +out: return ret; } diff --git a/fs/netfs/main.c b/fs/netfs/main.c index ab6cac1106762..abb8857486ee1 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -30,6 +30,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_READPAGE] = "RP", [NETFS_READ_FOR_WRITE] = "RW", [NETFS_WRITEBACK] = "WB", + [NETFS_DIO_READ] = "DR", }; /* diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 3aa0bfbc04ec3..7153f24e80341 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -20,7 +20,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct inode *inode = file ? file_inode(file) : mapping->host; struct netfs_inode *ctx = netfs_inode(inode); struct netfs_io_request *rreq; - bool cached = netfs_is_cache_enabled(ctx); + bool is_dio = (origin == NETFS_DIO_READ); + bool cached = is_dio && netfs_is_cache_enabled(ctx); int ret; rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request), @@ -42,6 +43,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); if (cached) __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); + if (file && file->f_flags & O_NONBLOCK) + __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { ret = rreq->netfs_ops->init_request(rreq, file); if (ret < 0) { diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index c1f85cd595a47..15fd5c3f0f392 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -9,6 +9,7 @@ #include #include "internal.h" +atomic_t netfs_n_rh_dio_read; atomic_t netfs_n_rh_readahead; atomic_t netfs_n_rh_readpage; atomic_t netfs_n_rh_rreq; @@ -36,7 +37,8 @@ atomic_t netfs_n_wh_write_failed; int netfs_stats_show(struct seq_file *m, void *v) { - seq_printf(m, "Netfs : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n", + seq_printf(m, "Netfs : DR=%u RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n", + atomic_read(&netfs_n_rh_dio_read), atomic_read(&netfs_n_rh_readahead), atomic_read(&netfs_n_rh_readpage), atomic_read(&netfs_n_rh_write_begin), diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 70f578cf3715f..7c13095684598 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -226,6 +226,7 @@ enum netfs_io_origin { NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_WRITEBACK, /* This write was triggered by writepages */ + NETFS_DIO_READ, /* This is a direct I/O read */ nr__netfs_io_origin } __mode(byte); @@ -240,6 +241,7 @@ struct netfs_io_request { }; struct inode *inode; /* The file being accessed */ struct address_space *mapping; /* The mapping being accessed */ + struct kiocb *iocb; /* AIO completion vector */ struct netfs_cache_resources cache_resources; struct list_head proc_link; /* Link in netfs_iorequests */ struct list_head subrequests; /* Contributory I/O operations */ @@ -249,12 +251,14 @@ struct netfs_io_request { struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; + unsigned int rsize; /* Maximum read size (0 for none) */ unsigned int wsize; /* Maximum write size (0 for none) */ unsigned int subreq_counter; /* Next subreq->debug_index */ atomic_t nr_outstanding; /* Number of ops in progress */ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ size_t submitted; /* Amount submitted for I/O so far */ size_t len; /* Length of the request */ + size_t transferred; /* Amount to be indicated as transferred */ short error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ @@ -271,6 +275,8 @@ struct netfs_io_request { #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ #define NETFS_RREQ_WRITE_TO_CACHE 7 /* Need to write to the cache */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ +#define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ +#define NETFS_RREQ_BLOCKED 10 /* We blocked */ const struct netfs_request_ops *netfs_ops; void (*cleanup)(struct netfs_io_request *req); }; @@ -367,6 +373,9 @@ struct netfs_cache_ops { loff_t *_data_start, size_t *_data_len); }; +/* High-level read API. */ +ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); + /* High-level write API */ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 082a5e717b58f..5a4edadf0e59d 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -16,6 +16,7 @@ * Define enums for tracing information. */ #define netfs_read_traces \ + EM(netfs_read_trace_dio_read, "DIO-READ ") \ EM(netfs_read_trace_expanded, "EXPANDED ") \ EM(netfs_read_trace_readahead, "READAHEAD") \ EM(netfs_read_trace_readpage, "READPAGE ") \ @@ -31,7 +32,8 @@ EM(NETFS_READAHEAD, "RA") \ EM(NETFS_READPAGE, "RP") \ EM(NETFS_READ_FOR_WRITE, "RW") \ - E_(NETFS_WRITEBACK, "WB") + EM(NETFS_WRITEBACK, "WB") \ + E_(NETFS_DIO_READ, "DR") #define netfs_rreq_traces \ EM(netfs_rreq_trace_assess, "ASSESS ") \ @@ -70,6 +72,8 @@ #define netfs_failures \ EM(netfs_fail_check_write_begin, "check-write-begin") \ EM(netfs_fail_copy_to_cache, "copy-to-cache") \ + EM(netfs_fail_dio_read_short, "dio-read-short") \ + EM(netfs_fail_dio_read_zero, "dio-read-zero") \ EM(netfs_fail_read, "read") \ EM(netfs_fail_short_read, "short-read") \ EM(netfs_fail_prepare_write, "prep-write") \ @@ -81,6 +85,7 @@ EM(netfs_rreq_trace_put_complete, "PUT COMPLT ") \ EM(netfs_rreq_trace_put_discard, "PUT DISCARD") \ EM(netfs_rreq_trace_put_failed, "PUT FAILED ") \ + EM(netfs_rreq_trace_put_no_submit, "PUT NO-SUBM") \ EM(netfs_rreq_trace_put_return, "PUT RETURN ") \ EM(netfs_rreq_trace_put_subreq, "PUT SUBREQ ") \ EM(netfs_rreq_trace_put_work, "PUT WORK ") \ diff --git a/mm/filemap.c b/mm/filemap.c index f1c8c278310fd..1c5271ed0cc08 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2678,6 +2678,7 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count) return filemap_write_and_wait_range(mapping, pos, end); } +EXPORT_SYMBOL_GPL(kiocb_write_and_wait); int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) { -- GitLab From 153a9961b551101cd38e94e26cd92fbfd198b19b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 21 Feb 2022 11:38:17 +0000 Subject: [PATCH 0586/1290] netfs: Implement unbuffered/DIO write support Implement support for unbuffered writes and direct I/O writes. If the write is misaligned with respect to the fscrypt block size, then RMW cycles are performed if necessary. DIO writes are a special case of unbuffered writes with extra restriction imposed, such as block size alignment requirements. Also provide a field that can tell the code to add some extra space onto the bounce buffer for use by the filesystem in the case of a content-encrypted file. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/afs/inode.c | 2 +- fs/netfs/Makefile | 1 + fs/netfs/direct_write.c | 166 +++++++++++++++++++++++++++++++++++ fs/netfs/internal.h | 6 ++ fs/netfs/io.c | 2 +- fs/netfs/main.c | 12 +-- fs/netfs/objects.c | 6 +- fs/netfs/output.c | 30 +++++++ include/linux/netfs.h | 4 + include/trace/events/netfs.h | 4 +- mm/filemap.c | 1 + 11 files changed, 224 insertions(+), 10 deletions(-) create mode 100644 fs/netfs/direct_write.c diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6f375f0cf650d..37485ae314714 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -250,7 +250,7 @@ static void afs_apply_status(struct afs_operation *op, * what's on the server. */ vnode->netfs.remote_i_size = status->size; - if (change_size) { + if (change_size || status->size > i_size_read(inode)) { afs_set_i_size(vnode, status->size); inode_set_ctime_to_ts(inode, t); inode_set_atime_to_ts(inode, t); diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index e968ab1eca400..d4d1d799819ec 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -4,6 +4,7 @@ netfs-y := \ buffered_read.o \ buffered_write.o \ direct_read.o \ + direct_write.o \ io.o \ iterator.o \ locking.o \ diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c new file mode 100644 index 0000000000000..bb0c2718f57bd --- /dev/null +++ b/fs/netfs/direct_write.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Unbuffered and direct write support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include "internal.h" + +static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) +{ + struct inode *inode = wreq->inode; + unsigned long long end = wreq->start + wreq->len; + + if (!wreq->error && + i_size_read(inode) < end) { + if (wreq->netfs_ops->update_i_size) + wreq->netfs_ops->update_i_size(inode, end); + else + i_size_write(inode, end); + } +} + +/* + * Perform an unbuffered write where we may have to do an RMW operation on an + * encrypted file. This can also be used for direct I/O writes. + */ +ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group) +{ + struct netfs_io_request *wreq; + unsigned long long start = iocb->ki_pos; + unsigned long long end = start + iov_iter_count(iter); + ssize_t ret, n; + bool async = !is_sync_kiocb(iocb); + + _enter(""); + + /* We're going to need a bounce buffer if what we transmit is going to + * be different in some way to the source buffer, e.g. because it gets + * encrypted/compressed or because it needs expanding to a block size. + */ + // TODO + + _debug("uw %llx-%llx", start, end); + + wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, + start, end - start, + iocb->ki_flags & IOCB_DIRECT ? + NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE); + if (IS_ERR(wreq)) + return PTR_ERR(wreq); + + { + /* If this is an async op and we're not using a bounce buffer, + * we have to save the source buffer as the iterator is only + * good until we return. In such a case, extract an iterator + * to represent as much of the the output buffer as we can + * manage. Note that the extraction might not be able to + * allocate a sufficiently large bvec array and may shorten the + * request. + */ + if (async || user_backed_iter(iter)) { + n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0); + if (n < 0) { + ret = n; + goto out; + } + wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec; + wreq->direct_bv_count = n; + wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); + wreq->len = iov_iter_count(&wreq->iter); + } else { + wreq->iter = *iter; + } + + wreq->io_iter = wreq->iter; + } + + /* Copy the data into the bounce buffer and encrypt it. */ + // TODO + + /* Dispatch the write. */ + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + if (async) + wreq->iocb = iocb; + wreq->cleanup = netfs_cleanup_dio_write; + ret = netfs_begin_write(wreq, is_sync_kiocb(iocb), + iocb->ki_flags & IOCB_DIRECT ? + netfs_write_trace_dio_write : + netfs_write_trace_unbuffered_write); + if (ret < 0) { + _debug("begin = %zd", ret); + goto out; + } + + if (!async) { + trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + + ret = wreq->error; + _debug("waited = %zd", ret); + if (ret == 0) { + ret = wreq->transferred; + iocb->ki_pos += ret; + } + } else { + ret = -EIOCBQUEUED; + } + +out: + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + return ret; +} + +/** + * netfs_unbuffered_write_iter - Unbuffered write to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * Do an unbuffered write to a file, writing the data directly to the server + * and not lodging the data in the pagecache. + * + * Return: + * * Negative error code if no data has been written at all of + * vfs_fsync_range() failed for a synchronous write + * * Number of bytes written, even for truncated writes + */ +ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct netfs_inode *ictx = netfs_inode(inode); + ssize_t ret; + + _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + + trace_netfs_write_iter(iocb, from); + + ret = netfs_start_io_direct(inode); + if (ret < 0) + return ret; + ret = generic_write_checks(iocb, from); + if (ret < 0) + goto out; + ret = file_remove_privs(file); + if (ret < 0) + goto out; + ret = file_update_time(file); + if (ret < 0) + goto out; + ret = kiocb_invalidate_pages(iocb, iov_iter_count(from)); + if (ret < 0) + goto out; + + fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode), + FSCACHE_INVAL_DIO_WRITE); + ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL); +out: + netfs_end_io_direct(inode); + return ret; +} +EXPORT_SYMBOL(netfs_unbuffered_write_iter); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 886c2e8f841fb..2de4f826dbe4f 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -26,6 +26,12 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq); int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len); +/* + * direct_write.c + */ +ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group); + /* * io.c */ diff --git a/fs/netfs/io.c b/fs/netfs/io.c index c972415c8aad3..01c7ff27228e4 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -645,7 +645,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, subreq->debug_index = (*_debug_index)++; subreq->start = rreq->start + rreq->submitted; - subreq->len = rreq->len - rreq->submitted; + subreq->len = io_iter->count; _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted); list_add_tail(&subreq->rreq_link, &rreq->subrequests); diff --git a/fs/netfs/main.c b/fs/netfs/main.c index abb8857486ee1..8e4db9ff40c40 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -26,11 +26,13 @@ LIST_HEAD(netfs_io_requests); DEFINE_SPINLOCK(netfs_proc_lock); static const char *netfs_origins[nr__netfs_io_origin] = { - [NETFS_READAHEAD] = "RA", - [NETFS_READPAGE] = "RP", - [NETFS_READ_FOR_WRITE] = "RW", - [NETFS_WRITEBACK] = "WB", - [NETFS_DIO_READ] = "DR", + [NETFS_READAHEAD] = "RA", + [NETFS_READPAGE] = "RP", + [NETFS_READ_FOR_WRITE] = "RW", + [NETFS_WRITEBACK] = "WB", + [NETFS_UNBUFFERED_WRITE] = "UW", + [NETFS_DIO_READ] = "DR", + [NETFS_DIO_WRITE] = "DW", }; /* diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 7153f24e80341..93f1d74311993 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -20,8 +20,10 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct inode *inode = file ? file_inode(file) : mapping->host; struct netfs_inode *ctx = netfs_inode(inode); struct netfs_io_request *rreq; - bool is_dio = (origin == NETFS_DIO_READ); - bool cached = is_dio && netfs_is_cache_enabled(ctx); + bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE || + origin == NETFS_DIO_READ || + origin == NETFS_DIO_WRITE); + bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx); int ret; rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request), diff --git a/fs/netfs/output.c b/fs/netfs/output.c index 2ad0fd8c32be2..560cbcea0c0ae 100644 --- a/fs/netfs/output.c +++ b/fs/netfs/output.c @@ -74,11 +74,21 @@ static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async { struct netfs_io_subrequest *subreq; struct netfs_inode *ctx = netfs_inode(wreq->inode); + size_t transferred = 0; _enter("R=%x[]", wreq->debug_id); trace_netfs_rreq(wreq, netfs_rreq_trace_write_done); + list_for_each_entry(subreq, &wreq->subrequests, rreq_link) { + if (subreq->error || subreq->transferred == 0) + break; + transferred += subreq->transferred; + if (subreq->transferred < subreq->len) + break; + } + wreq->transferred = transferred; + list_for_each_entry(subreq, &wreq->subrequests, rreq_link) { if (!subreq->error) continue; @@ -110,11 +120,28 @@ static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async wreq->cleanup(wreq); + if (wreq->origin == NETFS_DIO_WRITE && + wreq->mapping->nrpages) { + pgoff_t first = wreq->start >> PAGE_SHIFT; + pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT; + invalidate_inode_pages2_range(wreq->mapping, first, last); + } + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_end(wreq->inode); + _debug("finished"); trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); + if (wreq->iocb) { + wreq->iocb->ki_pos += transferred; + if (wreq->iocb->ki_complete) + wreq->iocb->ki_complete( + wreq->iocb, wreq->error ? wreq->error : transferred); + } + netfs_clear_subrequests(wreq, was_async); netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete); } @@ -329,6 +356,9 @@ int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, return -EIO; } + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_begin(wreq->inode); + wreq->io_iter = wreq->iter; /* ->outstanding > 0 carries a ref */ diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 7c13095684598..e1dfd6775c2c3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -138,6 +138,7 @@ struct netfs_inode { loff_t remote_i_size; /* Size of the remote file */ unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ +#define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ }; /* @@ -226,7 +227,9 @@ enum netfs_io_origin { NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_WRITEBACK, /* This write was triggered by writepages */ + NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_READ, /* This is a direct I/O read */ + NETFS_DIO_WRITE, /* This is a direct I/O write */ nr__netfs_io_origin } __mode(byte); @@ -379,6 +382,7 @@ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); /* High-level write API */ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group); +ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from); /* Address operations API */ struct readahead_control; diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 5a4edadf0e59d..914a24b03d08b 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -33,7 +33,9 @@ EM(NETFS_READPAGE, "RP") \ EM(NETFS_READ_FOR_WRITE, "RW") \ EM(NETFS_WRITEBACK, "WB") \ - E_(NETFS_DIO_READ, "DR") + EM(NETFS_UNBUFFERED_WRITE, "UW") \ + EM(NETFS_DIO_READ, "DR") \ + E_(NETFS_DIO_WRITE, "DW") #define netfs_rreq_traces \ EM(netfs_rreq_trace_assess, "ASSESS ") \ diff --git a/mm/filemap.c b/mm/filemap.c index 1c5271ed0cc08..73626eb323f35 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2706,6 +2706,7 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); } +EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); /** * generic_file_read_iter - generic filesystem read routine -- GitLab From 938e13a73b244278a3777f38fa915bd239b2efd2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 17 Jun 2021 13:09:21 +0100 Subject: [PATCH 0587/1290] netfs: Implement buffered write API Institute a netfs write helper, netfs_file_write_iter(), to be pointed at by the network filesystem ->write_iter() call. Make it handled buffered writes by calling the previously defined netfs_perform_write() to copy the source data into the pagecache. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/buffered_write.c | 83 +++++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 3 ++ 2 files changed, 86 insertions(+) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index b76688e98f819..f244123ab568c 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -333,3 +333,86 @@ error_folio_unlock: goto out; } EXPORT_SYMBOL(netfs_perform_write); + +/** + * netfs_buffered_write_iter_locked - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @from: iov_iter with data to write + * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * The caller must hold appropriate locks around this function and have called + * generic_write_checks() already. The caller is also responsible for doing + * any necessary syncing afterwards. + * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_rwsem. + * + * Return: + * * number of bytes written, even for truncated writes + * * negative error code if no data has been written at all + */ +ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from, + struct netfs_group *netfs_group) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + trace_netfs_write_iter(iocb, from); + + ret = file_remove_privs(file); + if (ret) + return ret; + + ret = file_update_time(file); + if (ret) + return ret; + + return netfs_perform_write(iocb, from, netfs_group); +} +EXPORT_SYMBOL(netfs_buffered_write_iter_locked); + +/** + * netfs_file_write_iter - write data to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * Perform a write to a file, writing into the pagecache if possible and doing + * an unbuffered write instead if not. + * + * Return: + * * Negative error code if no data has been written at all of + * vfs_fsync_range() failed for a synchronous write + * * Number of bytes written, even for truncated writes + */ +ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct netfs_inode *ictx = netfs_inode(inode); + ssize_t ret; + + _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + + if ((iocb->ki_flags & IOCB_DIRECT) || + test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) + return netfs_unbuffered_write_iter(iocb, from); + + ret = netfs_start_io_write(inode); + if (ret < 0) + return ret; + + ret = generic_write_checks(iocb, from); + if (ret > 0) + ret = netfs_buffered_write_iter_locked(iocb, from, NULL); + netfs_end_io_write(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +EXPORT_SYMBOL(netfs_file_write_iter); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index e1dfd6775c2c3..0948ecf69aa5d 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -382,7 +382,10 @@ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); /* High-level write API */ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group); +ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from, + struct netfs_group *netfs_group); ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from); +ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from); /* Address operations API */ struct readahead_control; -- GitLab From 102a7e2c598c22bd2621fa97eb1c93c89d469a12 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 15 Feb 2022 23:15:57 +0000 Subject: [PATCH 0588/1290] netfs: Allow buffered shared-writeable mmap through netfs_page_mkwrite() Provide an entry point to delegate a filesystem's ->page_mkwrite() to. This checks for conflicting writes, then attached any netfs-specific group marking (e.g. ceph snap) to the page to be considered dirty. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/buffered_write.c | 59 +++++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 4 +++ 2 files changed, 63 insertions(+) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index f244123ab568c..70cb8e98d068a 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -416,3 +416,62 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return ret; } EXPORT_SYMBOL(netfs_file_write_iter); + +/* + * Notification that a previously read-only page is about to become writable. + * Note that the caller indicates a single page of a multipage folio. + */ +vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group) +{ + struct folio *folio = page_folio(vmf->page); + struct file *file = vmf->vma->vm_file; + struct inode *inode = file_inode(file); + vm_fault_t ret = VM_FAULT_RETRY; + int err; + + _enter("%lx", folio->index); + + sb_start_pagefault(inode->i_sb); + + if (folio_wait_writeback_killable(folio)) + goto out; + + if (folio_lock_killable(folio) < 0) + goto out; + + /* Can we see a streaming write here? */ + if (WARN_ON(!folio_test_uptodate(folio))) { + ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED; + goto out; + } + + if (netfs_folio_group(folio) != netfs_group) { + folio_unlock(folio); + err = filemap_fdatawait_range(inode->i_mapping, + folio_pos(folio), + folio_pos(folio) + folio_size(folio)); + switch (err) { + case 0: + ret = VM_FAULT_RETRY; + goto out; + case -ENOMEM: + ret = VM_FAULT_OOM; + goto out; + default: + ret = VM_FAULT_SIGBUS; + goto out; + } + } + + if (folio_test_dirty(folio)) + trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus); + else + trace_netfs_folio(folio, netfs_folio_trace_mkwrite); + netfs_set_group(folio, netfs_group); + file_update_time(file); + ret = VM_FAULT_LOCKED; +out: + sb_end_pagefault(inode->i_sb); + return ret; +} +EXPORT_SYMBOL(netfs_page_mkwrite); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 0948ecf69aa5d..d7f324c7c22ae 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -400,6 +400,10 @@ void netfs_clear_inode_writeback(struct inode *inode, const void *aux); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length); bool netfs_release_folio(struct folio *folio, gfp_t gfp); +/* VMA operations API. */ +vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); + +/* (Sub)request management API. */ void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); -- GitLab From 80645bd4aa33a5c325f11b8dc6b38b38410ad5c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 11 Oct 2023 09:29:43 +0100 Subject: [PATCH 0589/1290] netfs: Provide netfs_file_read_iter() Provide a top-level-ish function that can be pointed to directly by ->read_iter file op. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/buffered_read.c | 73 ++++++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 2 ++ 2 files changed, 75 insertions(+) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 950f63fc156a8..a59e7b2edaacd 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -558,3 +558,76 @@ error: _leave(" = %d", ret); return ret; } + +/** + * netfs_buffered_read_iter - Filesystem buffered I/O read routine + * @iocb: kernel I/O control block + * @iter: destination for the data read + * + * This is the ->read_iter() routine for all filesystems that can use the page + * cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be + * returned when no data can be read without waiting for I/O requests to + * complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests + * shall be made for the read or for readahead. When no data can be read, + * -EAGAIN shall be returned. When readahead would be triggered, a partial, + * possibly empty read shall be returned. + * + * Return: + * * number of bytes copied, even for partial reads + * * negative error code (or 0 if IOCB_NOIO) if nothing was read + */ +ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct netfs_inode *ictx = netfs_inode(inode); + ssize_t ret; + + if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) || + test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))) + return -EINVAL; + + ret = netfs_start_io_read(inode); + if (ret == 0) { + ret = filemap_read(iocb, iter, 0); + netfs_end_io_read(inode); + } + return ret; +} +EXPORT_SYMBOL(netfs_buffered_read_iter); + +/** + * netfs_file_read_iter - Generic filesystem read routine + * @iocb: kernel I/O control block + * @iter: destination for the data read + * + * This is the ->read_iter() routine for all filesystems that can use the page + * cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be + * returned when no data can be read without waiting for I/O requests to + * complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests + * shall be made for the read or for readahead. When no data can be read, + * -EAGAIN shall be returned. When readahead would be triggered, a partial, + * possibly empty read shall be returned. + * + * Return: + * * number of bytes copied, even for partial reads + * * negative error code (or 0 if IOCB_NOIO) if nothing was read + */ +ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host); + + if ((iocb->ki_flags & IOCB_DIRECT) || + test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) + return netfs_unbuffered_read_iter(iocb, iter); + + return netfs_buffered_read_iter(iocb, iter); +} +EXPORT_SYMBOL(netfs_file_read_iter); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index d7f324c7c22ae..19a41c437af3f 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -378,6 +378,8 @@ struct netfs_cache_ops { /* High-level read API. */ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); /* High-level write API */ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, -- GitLab From e0ace6ca98bef0d8d354040f13ffc0a498813ee9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Nov 2023 17:18:17 +0000 Subject: [PATCH 0590/1290] netfs, cachefiles: Pass upper bound length to allow expansion Make netfslib pass the maximum length to the ->prepare_write() op to tell the cache how much it can expand the length of a write to. This allows a write to the server at the end of a file to be limited to a few bytes whilst writing an entire block to the cache (something required by direct I/O). Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/cachefiles/internal.h | 2 +- fs/cachefiles/io.c | 10 ++++++---- fs/cachefiles/ondemand.c | 2 +- fs/netfs/fscache_io.c | 2 +- fs/netfs/io.c | 2 +- fs/netfs/objects.c | 1 + fs/netfs/output.c | 25 ++++++++++--------------- fs/smb/client/fscache.c | 2 +- include/linux/netfs.h | 5 +++-- 9 files changed, 25 insertions(+), 26 deletions(-) diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 2ad58c4652084..1af48d576a34d 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -233,7 +233,7 @@ extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres, enum fscache_want_state want_state); extern int __cachefiles_prepare_write(struct cachefiles_object *object, struct file *file, - loff_t *_start, size_t *_len, + loff_t *_start, size_t *_len, size_t upper_len, bool no_space_allocated_yet); extern int __cachefiles_write(struct cachefiles_object *object, struct file *file, diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 009d23cd435b5..bffffedce4a93 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -518,7 +518,7 @@ cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres, */ int __cachefiles_prepare_write(struct cachefiles_object *object, struct file *file, - loff_t *_start, size_t *_len, + loff_t *_start, size_t *_len, size_t upper_len, bool no_space_allocated_yet) { struct cachefiles_cache *cache = object->volume->cache; @@ -530,6 +530,8 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, down = start - round_down(start, PAGE_SIZE); *_start = start - down; *_len = round_up(down + len, PAGE_SIZE); + if (down < start || *_len > upper_len) + return -ENOBUFS; /* We need to work out whether there's sufficient disk space to perform * the write - but we can skip that check if we have space already @@ -592,8 +594,8 @@ check_space: } static int cachefiles_prepare_write(struct netfs_cache_resources *cres, - loff_t *_start, size_t *_len, loff_t i_size, - bool no_space_allocated_yet) + loff_t *_start, size_t *_len, size_t upper_len, + loff_t i_size, bool no_space_allocated_yet) { struct cachefiles_object *object = cachefiles_cres_object(cres); struct cachefiles_cache *cache = object->volume->cache; @@ -609,7 +611,7 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres, cachefiles_begin_secure(cache, &saved_cred); ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres), - _start, _len, + _start, _len, upper_len, no_space_allocated_yet); cachefiles_end_secure(cache, saved_cred); return ret; diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 0254ed39f68ce..9301d1eb05045 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -52,7 +52,7 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, return -ENOBUFS; cachefiles_begin_secure(cache, &saved_cred); - ret = __cachefiles_prepare_write(object, file, &pos, &len, true); + ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true); cachefiles_end_secure(cache, saved_cred); if (ret < 0) return ret; diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index 79171a6879304..ad572f7ee897b 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -237,7 +237,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, fscache_access_io_write) < 0) goto abandon_free; - ret = cres->ops->prepare_write(cres, &start, &len, i_size, false); + ret = cres->ops->prepare_write(cres, &start, &len, len, i_size, false); if (ret < 0) goto abandon_end; diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 01c7ff27228e4..14c18be5aca00 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -199,7 +199,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) } ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, - rreq->i_size, true); + subreq->len, rreq->i_size, true); if (ret < 0) { trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 93f1d74311993..b4e3bd836e5df 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -33,6 +33,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, rreq->start = start; rreq->len = len; + rreq->upper_len = len; rreq->origin = origin; rreq->netfs_ops = ctx->ops; rreq->mapping = mapping; diff --git a/fs/netfs/output.c b/fs/netfs/output.c index 560cbcea0c0ae..cc9065733b423 100644 --- a/fs/netfs/output.c +++ b/fs/netfs/output.c @@ -280,7 +280,7 @@ EXPORT_SYMBOL(netfs_queue_write_request); */ static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq) { - struct netfs_cache_resources *cres; + struct netfs_cache_resources *cres = &wreq->cache_resources; struct netfs_io_subrequest *subreq; struct netfs_inode *ctx = netfs_inode(wreq->inode); struct fscache_cookie *cookie = netfs_i_cookie(ctx); @@ -294,26 +294,21 @@ static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq) } _debug("write to cache"); - subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len, - netfs_write_to_cache_op_worker); - if (!subreq) + ret = fscache_begin_write_operation(cres, cookie); + if (ret < 0) return; - cres = &wreq->cache_resources; - ret = fscache_begin_read_operation(cres, cookie); - if (ret < 0) { - netfs_write_subrequest_terminated(subreq, ret, false); + ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len, + i_size_read(wreq->inode), true); + if (ret < 0) return; - } - ret = cres->ops->prepare_write(cres, &start, &len, i_size_read(wreq->inode), - true); - if (ret < 0) { - netfs_write_subrequest_terminated(subreq, ret, false); + subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len, + netfs_write_to_cache_op_worker); + if (!subreq) return; - } - netfs_queue_write_request(subreq); + netfs_write_to_cache_op(subreq); } /* diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c index e5cad149f5a2d..c4a3cb736881a 100644 --- a/fs/smb/client/fscache.c +++ b/fs/smb/client/fscache.c @@ -180,7 +180,7 @@ static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_ if (ret < 0) return ret; - ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), + ret = cres.ops->prepare_write(&cres, &start, &len, len, i_size_read(inode), no_space_allocated_yet); if (ret == 0) ret = fscache_write(&cres, start, &iter, NULL, NULL); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 19a41c437af3f..2856389f4694c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -261,6 +261,7 @@ struct netfs_io_request { atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ size_t submitted; /* Amount submitted for I/O so far */ size_t len; /* Length of the request */ + size_t upper_len; /* Length can be extended to here */ size_t transferred; /* Amount to be indicated as transferred */ short error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ @@ -357,8 +358,8 @@ struct netfs_cache_ops { * actually do. */ int (*prepare_write)(struct netfs_cache_resources *cres, - loff_t *_start, size_t *_len, loff_t i_size, - bool no_space_allocated_yet); + loff_t *_start, size_t *_len, size_t upper_len, + loff_t i_size, bool no_space_allocated_yet); /* Prepare an on-demand read operation, shortening it to a cached/uncached * boundary as appropriate. -- GitLab From 62c3b7481b9a108cb99ef9438dba66bb4738768b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 28 Sep 2023 11:46:49 +0100 Subject: [PATCH 0591/1290] netfs: Provide a writepages implementation Provide an implementation of writepages for network filesystems to delegate to. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/buffered_write.c | 636 ++++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 2 + 2 files changed, 638 insertions(+) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 70cb8e98d068a..c078826f7fe67 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -32,6 +32,18 @@ static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group folio_attach_private(folio, netfs_get_group(netfs_group)); } +#if IS_ENABLED(CONFIG_FSCACHE) +static void netfs_folio_start_fscache(bool caching, struct folio *folio) +{ + if (caching) + folio_start_fscache(folio); +} +#else +static void netfs_folio_start_fscache(bool caching, struct folio *folio) +{ +} +#endif + /* * Decide how we should modify a folio. We might be attempting to do * write-streaming, in which case we don't want to a local RMW cycle if we can @@ -475,3 +487,627 @@ out: return ret; } EXPORT_SYMBOL(netfs_page_mkwrite); + +/* + * Kill all the pages in the given range + */ +static void netfs_kill_pages(struct address_space *mapping, + loff_t start, loff_t len) +{ + struct folio *folio; + pgoff_t index = start / PAGE_SIZE; + pgoff_t last = (start + len - 1) / PAGE_SIZE, next; + + _enter("%llx-%llx", start, start + len - 1); + + do { + _debug("kill %lx (to %lx)", index, last); + + folio = filemap_get_folio(mapping, index); + if (IS_ERR(folio)) { + next = index + 1; + continue; + } + + next = folio_next_index(folio); + + trace_netfs_folio(folio, netfs_folio_trace_kill); + folio_clear_uptodate(folio); + if (folio_test_fscache(folio)) + folio_end_fscache(folio); + folio_end_writeback(folio); + folio_lock(folio); + generic_error_remove_page(mapping, &folio->page); + folio_unlock(folio); + folio_put(folio); + + } while (index = next, index <= last); + + _leave(""); +} + +/* + * Redirty all the pages in a given range. + */ +static void netfs_redirty_pages(struct address_space *mapping, + loff_t start, loff_t len) +{ + struct folio *folio; + pgoff_t index = start / PAGE_SIZE; + pgoff_t last = (start + len - 1) / PAGE_SIZE, next; + + _enter("%llx-%llx", start, start + len - 1); + + do { + _debug("redirty %llx @%llx", len, start); + + folio = filemap_get_folio(mapping, index); + if (IS_ERR(folio)) { + next = index + 1; + continue; + } + + next = folio_next_index(folio); + trace_netfs_folio(folio, netfs_folio_trace_redirty); + filemap_dirty_folio(mapping, folio); + if (folio_test_fscache(folio)) + folio_end_fscache(folio); + folio_end_writeback(folio); + folio_put(folio); + } while (index = next, index <= last); + + balance_dirty_pages_ratelimited(mapping); + + _leave(""); +} + +/* + * Completion of write to server + */ +static void netfs_pages_written_back(struct netfs_io_request *wreq) +{ + struct address_space *mapping = wreq->mapping; + struct netfs_folio *finfo; + struct netfs_group *group = NULL; + struct folio *folio; + pgoff_t last; + int gcount = 0; + + XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE); + + _enter("%llx-%llx", wreq->start, wreq->start + wreq->len); + + rcu_read_lock(); + + last = (wreq->start + wreq->len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, last) { + WARN(!folio_test_writeback(folio), + "bad %zx @%llx page %lx %lx\n", + wreq->len, wreq->start, folio_index(folio), last); + + if ((finfo = netfs_folio_info(folio))) { + /* Streaming writes cannot be redirtied whilst under + * writeback, so discard the streaming record. + */ + folio_detach_private(folio); + group = finfo->netfs_group; + gcount++; + trace_netfs_folio(folio, netfs_folio_trace_clear_s); + kfree(finfo); + } else if ((group = netfs_folio_group(folio))) { + /* Need to detach the group pointer if the page didn't + * get redirtied. If it has been redirtied, then it + * must be within the same group. + */ + if (folio_test_dirty(folio)) { + trace_netfs_folio(folio, netfs_folio_trace_redirtied); + goto end_wb; + } + if (folio_trylock(folio)) { + if (!folio_test_dirty(folio)) { + folio_detach_private(folio); + gcount++; + trace_netfs_folio(folio, netfs_folio_trace_clear_g); + } else { + trace_netfs_folio(folio, netfs_folio_trace_redirtied); + } + folio_unlock(folio); + goto end_wb; + } + + xas_pause(&xas); + rcu_read_unlock(); + folio_lock(folio); + if (!folio_test_dirty(folio)) { + folio_detach_private(folio); + gcount++; + trace_netfs_folio(folio, netfs_folio_trace_clear_g); + } else { + trace_netfs_folio(folio, netfs_folio_trace_redirtied); + } + folio_unlock(folio); + rcu_read_lock(); + } else { + trace_netfs_folio(folio, netfs_folio_trace_clear); + } + end_wb: + if (folio_test_fscache(folio)) + folio_end_fscache(folio); + folio_end_writeback(folio); + } + + rcu_read_unlock(); + netfs_put_group_many(group, gcount); + _leave(""); +} + +/* + * Deal with the disposition of the folios that are under writeback to close + * out the operation. + */ +static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq) +{ + struct address_space *mapping = wreq->mapping; + + _enter(""); + + switch (wreq->error) { + case 0: + netfs_pages_written_back(wreq); + break; + + default: + pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error); + fallthrough; + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + case -ENETRESET: + case -EDQUOT: + case -ENOSPC: + netfs_redirty_pages(mapping, wreq->start, wreq->len); + break; + + case -EROFS: + case -EIO: + case -EREMOTEIO: + case -EFBIG: + case -ENOENT: + case -ENOMEDIUM: + case -ENXIO: + netfs_kill_pages(mapping, wreq->start, wreq->len); + break; + } + + if (wreq->error) + mapping_set_error(mapping, wreq->error); + if (wreq->netfs_ops->done) + wreq->netfs_ops->done(wreq); +} + +/* + * Extend the region to be written back to include subsequent contiguously + * dirty pages if possible, but don't sleep while doing so. + * + * If this page holds new content, then we can include filler zeros in the + * writeback. + */ +static void netfs_extend_writeback(struct address_space *mapping, + struct netfs_group *group, + struct xa_state *xas, + long *_count, + loff_t start, + loff_t max_len, + bool caching, + size_t *_len, + size_t *_top) +{ + struct netfs_folio *finfo; + struct folio_batch fbatch; + struct folio *folio; + unsigned int i; + pgoff_t index = (start + *_len) / PAGE_SIZE; + size_t len; + void *priv; + bool stop = true; + + folio_batch_init(&fbatch); + + do { + /* Firstly, we gather up a batch of contiguous dirty pages + * under the RCU read lock - but we can't clear the dirty flags + * there if any of those pages are mapped. + */ + rcu_read_lock(); + + xas_for_each(xas, folio, ULONG_MAX) { + stop = true; + if (xas_retry(xas, folio)) + continue; + if (xa_is_value(folio)) + break; + if (folio_index(folio) != index) { + xas_reset(xas); + break; + } + + if (!folio_try_get_rcu(folio)) { + xas_reset(xas); + continue; + } + + /* Has the folio moved or been split? */ + if (unlikely(folio != xas_reload(xas))) { + folio_put(folio); + xas_reset(xas); + break; + } + + if (!folio_trylock(folio)) { + folio_put(folio); + xas_reset(xas); + break; + } + if (!folio_test_dirty(folio) || + folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + folio_put(folio); + xas_reset(xas); + break; + } + + stop = false; + len = folio_size(folio); + priv = folio_get_private(folio); + if ((const struct netfs_group *)priv != group) { + stop = true; + finfo = netfs_folio_info(folio); + if (finfo->netfs_group != group || + finfo->dirty_offset > 0) { + folio_unlock(folio); + folio_put(folio); + xas_reset(xas); + break; + } + len = finfo->dirty_len; + } + + *_top += folio_size(folio); + index += folio_nr_pages(folio); + *_count -= folio_nr_pages(folio); + *_len += len; + if (*_len >= max_len || *_count <= 0) + stop = true; + + if (!folio_batch_add(&fbatch, folio)) + break; + if (stop) + break; + } + + xas_pause(xas); + rcu_read_unlock(); + + /* Now, if we obtained any folios, we can shift them to being + * writable and mark them for caching. + */ + if (!folio_batch_count(&fbatch)) + break; + + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; + trace_netfs_folio(folio, netfs_folio_trace_store_plus); + + if (!folio_clear_dirty_for_io(folio)) + BUG(); + folio_start_writeback(folio); + netfs_folio_start_fscache(caching, folio); + folio_unlock(folio); + } + + folio_batch_release(&fbatch); + cond_resched(); + } while (!stop); +} + +/* + * Synchronously write back the locked page and any subsequent non-locked dirty + * pages. + */ +static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping, + struct writeback_control *wbc, + struct netfs_group *group, + struct xa_state *xas, + struct folio *folio, + unsigned long long start, + unsigned long long end) +{ + struct netfs_io_request *wreq; + struct netfs_folio *finfo; + struct netfs_inode *ctx = netfs_inode(mapping->host); + unsigned long long i_size = i_size_read(&ctx->inode); + size_t len, max_len; + bool caching = netfs_is_cache_enabled(ctx); + long count = wbc->nr_to_write; + int ret; + + _enter(",%lx,%llx-%llx,%u", folio_index(folio), start, end, caching); + + wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio), + NETFS_WRITEBACK); + if (IS_ERR(wreq)) { + folio_unlock(folio); + return PTR_ERR(wreq); + } + + if (!folio_clear_dirty_for_io(folio)) + BUG(); + folio_start_writeback(folio); + netfs_folio_start_fscache(caching, folio); + + count -= folio_nr_pages(folio); + + /* Find all consecutive lockable dirty pages that have contiguous + * written regions, stopping when we find a page that is not + * immediately lockable, is not dirty or is missing, or we reach the + * end of the range. + */ + trace_netfs_folio(folio, netfs_folio_trace_store); + + len = wreq->len; + finfo = netfs_folio_info(folio); + if (finfo) { + start += finfo->dirty_offset; + if (finfo->dirty_offset + finfo->dirty_len != len) { + len = finfo->dirty_len; + goto cant_expand; + } + len = finfo->dirty_len; + } + + if (start < i_size) { + /* Trim the write to the EOF; the extra data is ignored. Also + * put an upper limit on the size of a single storedata op. + */ + max_len = 65536 * 4096; + max_len = min_t(unsigned long long, max_len, end - start + 1); + max_len = min_t(unsigned long long, max_len, i_size - start); + + if (len < max_len) + netfs_extend_writeback(mapping, group, xas, &count, start, + max_len, caching, &len, &wreq->upper_len); + } + +cant_expand: + len = min_t(unsigned long long, len, i_size - start); + + /* We now have a contiguous set of dirty pages, each with writeback + * set; the first page is still locked at this point, but all the rest + * have been unlocked. + */ + folio_unlock(folio); + wreq->start = start; + wreq->len = len; + + if (start < i_size) { + _debug("write back %zx @%llx [%llx]", len, start, i_size); + + /* Speculatively write to the cache. We have to fix this up + * later if the store fails. + */ + wreq->cleanup = netfs_cleanup_buffered_write; + + iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start, + wreq->upper_len); + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback); + if (ret == 0 || ret == -EIOCBQUEUED) + wbc->nr_to_write -= len / PAGE_SIZE; + } else { + _debug("write discard %zx @%llx [%llx]", len, start, i_size); + + /* The dirty region was entirely beyond the EOF. */ + fscache_clear_page_bits(mapping, start, len, caching); + netfs_pages_written_back(wreq); + ret = 0; + } + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + _leave(" = 1"); + return 1; +} + +/* + * Write a region of pages back to the server + */ +static ssize_t netfs_writepages_begin(struct address_space *mapping, + struct writeback_control *wbc, + struct netfs_group *group, + struct xa_state *xas, + unsigned long long *_start, + unsigned long long end) +{ + const struct netfs_folio *finfo; + struct folio *folio; + unsigned long long start = *_start; + ssize_t ret; + void *priv; + int skips = 0; + + _enter("%llx,%llx,", start, end); + +search_again: + /* Find the first dirty page in the group. */ + rcu_read_lock(); + + for (;;) { + folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY); + if (xas_retry(xas, folio) || xa_is_value(folio)) + continue; + if (!folio) + break; + + if (!folio_try_get_rcu(folio)) { + xas_reset(xas); + continue; + } + + if (unlikely(folio != xas_reload(xas))) { + folio_put(folio); + xas_reset(xas); + continue; + } + + /* Skip any dirty folio that's not in the group of interest. */ + priv = folio_get_private(folio); + if ((const struct netfs_group *)priv != group) { + finfo = netfs_folio_info(folio); + if (finfo->netfs_group != group) { + folio_put(folio); + continue; + } + } + + xas_pause(xas); + break; + } + rcu_read_unlock(); + if (!folio) + return 0; + + start = folio_pos(folio); /* May regress with THPs */ + + _debug("wback %lx", folio_index(folio)); + + /* At this point we hold neither the i_pages lock nor the page lock: + * the page may be truncated or invalidated (changing page->mapping to + * NULL), or even swizzled back from swapper_space to tmpfs file + * mapping + */ +lock_again: + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = folio_lock_killable(folio); + if (ret < 0) + return ret; + } else { + if (!folio_trylock(folio)) + goto search_again; + } + + if (folio->mapping != mapping || + !folio_test_dirty(folio)) { + start += folio_size(folio); + folio_unlock(folio); + goto search_again; + } + + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + if (wbc->sync_mode != WB_SYNC_NONE) { + folio_wait_writeback(folio); +#ifdef CONFIG_NETFS_FSCACHE + folio_wait_fscache(folio); +#endif + goto lock_again; + } + + start += folio_size(folio); + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) { + ret = 0; + goto out; + } + skips++; + } + goto search_again; + } + + ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas, + folio, start, end); +out: + if (ret > 0) + *_start = start + ret; + _leave(" = %zd [%llx]", ret, *_start); + return ret; +} + +/* + * Write a region of pages back to the server + */ +static int netfs_writepages_region(struct address_space *mapping, + struct writeback_control *wbc, + struct netfs_group *group, + unsigned long long *_start, + unsigned long long end) +{ + ssize_t ret; + + XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE); + + do { + ret = netfs_writepages_begin(mapping, wbc, group, &xas, + _start, end); + if (ret > 0 && wbc->nr_to_write > 0) + cond_resched(); + } while (ret > 0 && wbc->nr_to_write > 0); + + return ret > 0 ? 0 : ret; +} + +/* + * write some of the pending data back to the server + */ +int netfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct netfs_group *group = NULL; + loff_t start, end; + int ret; + + _enter(""); + + /* We have to be careful as we can end up racing with setattr() + * truncating the pagecache since the caller doesn't take a lock here + * to prevent it. + */ + + if (wbc->range_cyclic && mapping->writeback_index) { + start = mapping->writeback_index * PAGE_SIZE; + ret = netfs_writepages_region(mapping, wbc, group, + &start, LLONG_MAX); + if (ret < 0) + goto out; + + if (wbc->nr_to_write <= 0) { + mapping->writeback_index = start / PAGE_SIZE; + goto out; + } + + start = 0; + end = mapping->writeback_index * PAGE_SIZE; + mapping->writeback_index = 0; + ret = netfs_writepages_region(mapping, wbc, group, &start, end); + if (ret == 0) + mapping->writeback_index = start / PAGE_SIZE; + } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { + start = 0; + ret = netfs_writepages_region(mapping, wbc, group, + &start, LLONG_MAX); + if (wbc->nr_to_write > 0 && ret == 0) + mapping->writeback_index = start / PAGE_SIZE; + } else { + start = wbc->range_start; + ret = netfs_writepages_region(mapping, wbc, group, + &start, wbc->range_end); + } + +out: + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_writepages); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 2856389f4694c..86bb8cb7f8d08 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -397,6 +397,8 @@ int netfs_read_folio(struct file *, struct folio *); int netfs_write_begin(struct netfs_inode *, struct file *, struct address_space *, loff_t pos, unsigned int len, struct folio **, void **fsdata); +int netfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio); int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); void netfs_clear_inode_writeback(struct inode *inode, const void *aux); -- GitLab From 4a79616cfb27d76947ea37f0336745ef929d56be Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 5 Oct 2023 16:52:58 +0100 Subject: [PATCH 0592/1290] netfs: Provide a launder_folio implementation Provide a launder_folio implementation for netfslib. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/buffered_write.c | 74 ++++++++++++++++++++++++++++++++++++ fs/netfs/main.c | 1 + include/linux/netfs.h | 2 + include/trace/events/netfs.h | 3 ++ 4 files changed, 80 insertions(+) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index c078826f7fe67..50be8fe3ca432 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -1111,3 +1111,77 @@ out: return ret; } EXPORT_SYMBOL(netfs_writepages); + +/* + * Deal with the disposition of a laundered folio. + */ +static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq) +{ + if (wreq->error) { + pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error); + mapping_set_error(wreq->mapping, wreq->error); + } +} + +/** + * netfs_launder_folio - Clean up a dirty folio that's being invalidated + * @folio: The folio to clean + * + * This is called to write back a folio that's being invalidated when an inode + * is getting torn down. Ideally, writepages would be used instead. + */ +int netfs_launder_folio(struct folio *folio) +{ + struct netfs_io_request *wreq; + struct address_space *mapping = folio->mapping; + struct netfs_folio *finfo = netfs_folio_info(folio); + struct netfs_group *group = netfs_folio_group(folio); + struct bio_vec bvec; + unsigned long long i_size = i_size_read(mapping->host); + unsigned long long start = folio_pos(folio); + size_t offset = 0, len; + int ret = 0; + + if (finfo) { + offset = finfo->dirty_offset; + start += offset; + len = finfo->dirty_len; + } else { + len = folio_size(folio); + } + len = min_t(unsigned long long, len, i_size - start); + + wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE); + if (IS_ERR(wreq)) { + ret = PTR_ERR(wreq); + goto out; + } + + if (!folio_clear_dirty_for_io(folio)) + goto out_put; + + trace_netfs_folio(folio, netfs_folio_trace_launder); + + _debug("launder %llx-%llx", start, start + len - 1); + + /* Speculatively write to the cache. We have to fix this up later if + * the store fails. + */ + wreq->cleanup = netfs_cleanup_launder_folio; + + bvec_set_folio(&bvec, folio, len, offset); + iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len); + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + ret = netfs_begin_write(wreq, true, netfs_write_trace_launder); + +out_put: + folio_detach_private(folio); + netfs_put_group(group); + kfree(finfo); + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); +out: + folio_wait_fscache(folio); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_launder_folio); diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 8e4db9ff40c40..473f889e1bd18 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -30,6 +30,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_READPAGE] = "RP", [NETFS_READ_FOR_WRITE] = "RW", [NETFS_WRITEBACK] = "WB", + [NETFS_LAUNDER_WRITE] = "LW", [NETFS_UNBUFFERED_WRITE] = "UW", [NETFS_DIO_READ] = "DR", [NETFS_DIO_WRITE] = "DW", diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 86bb8cb7f8d08..29c66acad9256 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -227,6 +227,7 @@ enum netfs_io_origin { NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_WRITEBACK, /* This write was triggered by writepages */ + NETFS_LAUNDER_WRITE, /* This is triggered by ->launder_folio() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_DIO_WRITE, /* This is a direct I/O write */ @@ -404,6 +405,7 @@ int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); void netfs_clear_inode_writeback(struct inode *inode, const void *aux); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length); bool netfs_release_folio(struct folio *folio, gfp_t gfp); +int netfs_launder_folio(struct folio *folio); /* VMA operations API. */ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 914a24b03d08b..cc998798e20a4 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -25,6 +25,7 @@ #define netfs_write_traces \ EM(netfs_write_trace_dio_write, "DIO-WRITE") \ + EM(netfs_write_trace_launder, "LAUNDER ") \ EM(netfs_write_trace_unbuffered_write, "UNB-WRITE") \ E_(netfs_write_trace_writeback, "WRITEBACK") @@ -33,6 +34,7 @@ EM(NETFS_READPAGE, "RP") \ EM(NETFS_READ_FOR_WRITE, "RW") \ EM(NETFS_WRITEBACK, "WB") \ + EM(NETFS_LAUNDER_WRITE, "LW") \ EM(NETFS_UNBUFFERED_WRITE, "UW") \ EM(NETFS_DIO_READ, "DR") \ E_(NETFS_DIO_WRITE, "DW") @@ -127,6 +129,7 @@ EM(netfs_folio_trace_end_copy, "end-copy") \ EM(netfs_folio_trace_filled_gaps, "filled-gaps") \ EM(netfs_folio_trace_kill, "kill") \ + EM(netfs_folio_trace_launder, "launder") \ EM(netfs_folio_trace_mkwrite, "mkwrite") \ EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \ EM(netfs_folio_trace_read_gaps, "read-gaps") \ -- GitLab From 41d8e7673a7726cba57cb8112d81c89cfb6c3e35 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Oct 2023 09:06:24 +0100 Subject: [PATCH 0593/1290] netfs: Implement a write-through caching option Provide a flag whereby a filesystem may request that cifs_perform_write() perform write-through caching. This involves putting pages directly into writeback rather than dirty and attaching them to a write operation as we go. Further, the writes being made are limited to the byte range being written rather than whole folios being written. This can be used by cifs, for example, to deal with strict byte-range locking. This can't be used with content encryption as that may require expansion of the write RPC beyond the write being made. This doesn't affect writes via mmap - those are written back in the normal way; similarly failed writethrough writes are marked dirty and left to writeback to retry. Another option would be to simply invalidate them, but the contents can be simultaneously accessed by read() and through mmap. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/buffered_write.c | 69 +++++++++++++++++++++++---- fs/netfs/internal.h | 3 ++ fs/netfs/main.c | 1 + fs/netfs/objects.c | 1 + fs/netfs/output.c | 90 ++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 2 + include/trace/events/netfs.h | 8 +++- 7 files changed, 162 insertions(+), 12 deletions(-) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 50be8fe3ca432..6ca6c4bde5eb2 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -26,6 +26,8 @@ enum netfs_how_to_modify { NETFS_FLUSH_CONTENT, /* Flush incompatible content. */ }; +static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq); + static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) { if (netfs_group && !folio_get_private(folio)) @@ -133,6 +135,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; struct netfs_inode *ctx = netfs_inode(inode); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .for_sync = true, + .nr_to_write = LONG_MAX, + .range_start = iocb->ki_pos, + .range_end = iocb->ki_pos + iter->count, + }; + struct netfs_io_request *wreq = NULL; struct netfs_folio *finfo; struct folio *folio; enum netfs_how_to_modify howto; @@ -143,6 +153,30 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; bool maybe_trouble = false; + if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) || + iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) + ) { + if (pos < i_size_read(inode)) { + ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count); + if (ret < 0) { + goto out; + } + } + + wbc_attach_fdatawrite_inode(&wbc, mapping->host); + + wreq = netfs_begin_writethrough(iocb, iter->count); + if (IS_ERR(wreq)) { + wbc_detach_inode(&wbc); + ret = PTR_ERR(wreq); + wreq = NULL; + goto out; + } + if (!is_sync_kiocb(iocb)) + wreq->iocb = iocb; + wreq->cleanup = netfs_cleanup_buffered_write; + } + do { size_t flen; size_t offset; /* Offset into pagecache folio */ @@ -315,7 +349,25 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, } written += copied; - folio_mark_dirty(folio); + if (likely(!wreq)) { + folio_mark_dirty(folio); + } else { + if (folio_test_dirty(folio)) + /* Sigh. mmap. */ + folio_clear_dirty_for_io(folio); + /* We make multiple writes to the folio... */ + if (!folio_test_writeback(folio)) { + folio_wait_fscache(folio); + folio_start_writeback(folio); + folio_start_fscache(folio); + if (wreq->iter.count == 0) + trace_netfs_folio(folio, netfs_folio_trace_wthru); + else + trace_netfs_folio(folio, netfs_folio_trace_wthru_plus); + } + netfs_advance_writethrough(wreq, copied, + offset + copied == flen); + } retry: folio_unlock(folio); folio_put(folio); @@ -325,17 +377,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, } while (iov_iter_count(iter)); out: - if (likely(written)) { - /* Flush and wait for a write that requires immediate synchronisation. */ - if (iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) { - _debug("dsync"); - ret = filemap_fdatawait_range(mapping, iocb->ki_pos, - iocb->ki_pos + written); - } - - iocb->ki_pos += written; + if (unlikely(wreq)) { + ret = netfs_end_writethrough(wreq, iocb); + wbc_detach_inode(&wbc); + if (ret == -EIOCBQUEUED) + return ret; } + iocb->ki_pos += written; _leave(" = %zd [%zd]", written, ret); return written ? written : ret; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 2de4f826dbe4f..d2d63120ac60a 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -101,6 +101,9 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, */ int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, enum netfs_write_trace what); +struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len); +int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end); +int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb); /* * stats.c diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 473f889e1bd18..81a13071b258a 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -30,6 +30,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_READPAGE] = "RP", [NETFS_READ_FOR_WRITE] = "RW", [NETFS_WRITEBACK] = "WB", + [NETFS_WRITETHROUGH] = "WT", [NETFS_LAUNDER_WRITE] = "LW", [NETFS_UNBUFFERED_WRITE] = "UW", [NETFS_DIO_READ] = "DR", diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index b4e3bd836e5df..610ceb5bd86c0 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -41,6 +41,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, rreq->i_size = i_size_read(inode); rreq->debug_id = atomic_inc_return(&debug_ids); INIT_LIST_HEAD(&rreq->subrequests); + INIT_WORK(&rreq->work, NULL); refcount_set(&rreq->ref, 1); __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); diff --git a/fs/netfs/output.c b/fs/netfs/output.c index cc9065733b423..625eb68f3e5ad 100644 --- a/fs/netfs/output.c +++ b/fs/netfs/output.c @@ -386,3 +386,93 @@ int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, TASK_UNINTERRUPTIBLE); return wreq->error; } + +/* + * Begin a write operation for writing through the pagecache. + */ +struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len) +{ + struct netfs_io_request *wreq; + struct file *file = iocb->ki_filp; + + wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len, + NETFS_WRITETHROUGH); + if (IS_ERR(wreq)) + return wreq; + + trace_netfs_write(wreq, netfs_write_trace_writethrough); + + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0); + wreq->io_iter = wreq->iter; + + /* ->outstanding > 0 carries a ref */ + netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding); + atomic_set(&wreq->nr_outstanding, 1); + return wreq; +} + +static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final) +{ + struct netfs_inode *ictx = netfs_inode(wreq->inode); + unsigned long long start; + size_t len; + + if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) + return; + + start = wreq->start + wreq->submitted; + len = wreq->iter.count - wreq->submitted; + if (!final) { + len /= wreq->wsize; /* Round to number of maximum packets */ + len *= wreq->wsize; + } + + ictx->ops->create_write_requests(wreq, start, len); + wreq->submitted += len; +} + +/* + * Advance the state of the write operation used when writing through the + * pagecache. Data has been copied into the pagecache that we need to append + * to the request. If we've added more than wsize then we need to create a new + * subrequest. + */ +int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end) +{ + _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u", + wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end); + + wreq->iter.count += copied; + wreq->io_iter.count += copied; + if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize) + netfs_submit_writethrough(wreq, false); + + return wreq->error; +} + +/* + * End a write operation used when writing through the pagecache. + */ +int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb) +{ + int ret = -EIOCBQUEUED; + + _enter("ic=%zu sb=%zu ws=%u", + wreq->iter.count, wreq->submitted, wreq->wsize); + + if (wreq->submitted < wreq->io_iter.count) + netfs_submit_writethrough(wreq, true); + + if (atomic_dec_and_test(&wreq->nr_outstanding)) + netfs_write_terminated(wreq, false); + + if (is_sync_kiocb(iocb)) { + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + ret = wreq->error; + } + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + return ret; +} diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 29c66acad9256..8a2dd882a7814 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -139,6 +139,7 @@ struct netfs_inode { unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ +#define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ }; /* @@ -227,6 +228,7 @@ enum netfs_io_origin { NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_WRITEBACK, /* This write was triggered by writepages */ + NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_LAUNDER_WRITE, /* This is triggered by ->launder_folio() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_READ, /* This is a direct I/O read */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index cc998798e20a4..447a8c21cf57d 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -27,13 +27,15 @@ EM(netfs_write_trace_dio_write, "DIO-WRITE") \ EM(netfs_write_trace_launder, "LAUNDER ") \ EM(netfs_write_trace_unbuffered_write, "UNB-WRITE") \ - E_(netfs_write_trace_writeback, "WRITEBACK") + EM(netfs_write_trace_writeback, "WRITEBACK") \ + E_(netfs_write_trace_writethrough, "WRITETHRU") #define netfs_rreq_origins \ EM(NETFS_READAHEAD, "RA") \ EM(NETFS_READPAGE, "RP") \ EM(NETFS_READ_FOR_WRITE, "RW") \ EM(NETFS_WRITEBACK, "WB") \ + EM(NETFS_WRITETHROUGH, "WT") \ EM(NETFS_LAUNDER_WRITE, "LW") \ EM(NETFS_UNBUFFERED_WRITE, "UW") \ EM(NETFS_DIO_READ, "DR") \ @@ -136,7 +138,9 @@ EM(netfs_folio_trace_redirty, "redirty") \ EM(netfs_folio_trace_redirtied, "redirtied") \ EM(netfs_folio_trace_store, "store") \ - E_(netfs_folio_trace_store_plus, "store+") + EM(netfs_folio_trace_store_plus, "store+") \ + EM(netfs_folio_trace_wthru, "wthru") \ + E_(netfs_folio_trace_wthru_plus, "wthru+") #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY -- GitLab From 100ccd18bb41ea7abb4fbb419202c06079559501 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2023 13:39:02 +0000 Subject: [PATCH 0594/1290] netfs: Optimise away reads above the point at which there can be no data Track the file position above which the server is not expected to have any data (the "zero point") and preemptively assume that we can satisfy requests by filling them with zeroes locally rather than attempting to download them if they're over that line - even if we've written data back to the server. Assume that any data that was written back above that position is held in the local cache. Note that we have to split requests that straddle the line. Make use of this to optimise away some reads from the server. We need to set the zero point in the following circumstances: (1) When we see an extant remote inode and have no cache for it, we set the zero_point to i_size. (2) On local inode creation, we set zero_point to 0. (3) On local truncation down, we reduce zero_point to the new i_size if the new i_size is lower. (4) On local truncation up, we don't change zero_point. (5) On local modification, we don't change zero_point. (6) On remote invalidation, we set zero_point to the new i_size. (7) If stored data is discarded from the pagecache or culled from fscache, we must set zero_point above that if the data also got written to the server. (8) If dirty data is written back to the server, but not fscache, we must set zero_point above that. (9) If a direct I/O write is made, set zero_point above that. Assuming the above, any read from the server at or above the zero_point position will return all zeroes. The zero_point value can be stored in the cache, provided the above rules are applied to it by any code that culls part of the local cache. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/9p/vfs_inode.c | 2 +- fs/afs/dynroot.c | 2 +- fs/afs/inode.c | 24 ++++++++++++++---------- fs/ceph/inode.c | 2 +- fs/netfs/buffered_write.c | 2 +- fs/netfs/direct_write.c | 4 ++++ fs/netfs/io.c | 10 ++++++++++ fs/netfs/misc.c | 5 +++++ fs/nfs/fscache.h | 2 +- fs/smb/client/cifsfs.c | 4 ++-- include/linux/netfs.h | 21 ++++++++++++++++++--- 11 files changed, 58 insertions(+), 20 deletions(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 74122540e00f2..df7ae381a7088 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -249,7 +249,7 @@ void v9fs_free_inode(struct inode *inode) static void v9fs_set_netfs_context(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); - netfs_inode_init(&v9inode->netfs, &v9fs_req_ops); + netfs_inode_init(&v9inode->netfs, &v9fs_req_ops, true); } int v9fs_init_inode(struct v9fs_session_info *v9ses, diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 1f656005018ea..9c517269ff954 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) /* there shouldn't be an existing inode */ BUG_ON(!(inode->i_state & I_NEW)); - netfs_inode_init(&vnode->netfs, NULL); + netfs_inode_init(&vnode->netfs, NULL, false); inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 37485ae314714..381521e9e1183 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren */ static void afs_set_netfs_context(struct afs_vnode *vnode) { - netfs_inode_init(&vnode->netfs, &afs_req_ops); + netfs_inode_init(&vnode->netfs, &afs_req_ops, true); } /* @@ -168,6 +168,7 @@ static void afs_apply_status(struct afs_operation *op, struct inode *inode = &vnode->netfs.inode; struct timespec64 t; umode_t mode; + bool unexpected_jump = false; bool data_changed = false; bool change_size = vp->set_size; @@ -231,6 +232,7 @@ static void afs_apply_status(struct afs_operation *op, } change_size = true; data_changed = true; + unexpected_jump = true; } else if (vnode->status.type == AFS_FTYPE_DIR) { /* Expected directory change is handled elsewhere so * that we can locally edit the directory and save on a @@ -252,6 +254,8 @@ static void afs_apply_status(struct afs_operation *op, vnode->netfs.remote_i_size = status->size; if (change_size || status->size > i_size_read(inode)) { afs_set_i_size(vnode, status->size); + if (unexpected_jump) + vnode->netfs.zero_point = status->size; inode_set_ctime_to_ts(inode, t); inode_set_atime_to_ts(inode, t); } @@ -865,17 +869,17 @@ static void afs_setattr_success(struct afs_operation *op) static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct inode *inode = &vp->vnode->netfs.inode; + struct afs_vnode *vnode = vp->vnode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; loff_t i_size = op->setattr.old_i_size; - if (size < i_size) - truncate_pagecache(inode, size); - if (size != i_size) - fscache_resize_cookie(afs_vnode_cache(vp->vnode), - vp->scb.status.size); + if (size != i_size) { + truncate_setsize(&vnode->netfs.inode, size); + netfs_resize_file(&vnode->netfs, size, true); + fscache_resize_cookie(afs_vnode_cache(vnode), size); + } } } @@ -943,11 +947,11 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) && attr->ia_size < i_size && - attr->ia_size > vnode->status.size) { - truncate_pagecache(inode, attr->ia_size); + attr->ia_size > vnode->netfs.remote_i_size) { + truncate_setsize(inode, attr->ia_size); + netfs_resize_file(&vnode->netfs, size, false); fscache_resize_cookie(afs_vnode_cache(vnode), attr->ia_size); - i_size_write(inode, attr->ia_size); ret = 0; goto out_unlock; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 3149d79a9dbe5..0c25d326afc41 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -574,7 +574,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) doutc(fsc->client, "%p\n", &ci->netfs.inode); /* Set parameters for the netfs library */ - netfs_inode_init(&ci->netfs, &ceph_netfs_ops); + netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); spin_lock_init(&ci->i_ceph_lock); diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 6ca6c4bde5eb2..08f28800232c3 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -73,7 +73,7 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, if (folio_test_uptodate(folio)) return NETFS_FOLIO_IS_UPTODATE; - if (pos >= ctx->remote_i_size) + if (pos >= ctx->zero_point) return NETFS_MODIFY_AND_CLEAR; if (!maybe_trouble && offset == 0 && len >= flen) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index bb0c2718f57bd..aad05f2349a48 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -134,6 +134,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct netfs_inode *ictx = netfs_inode(inode); + unsigned long long end; ssize_t ret; _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); @@ -155,6 +156,9 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = kiocb_invalidate_pages(iocb, iov_iter_count(from)); if (ret < 0) goto out; + end = iocb->ki_pos + iov_iter_count(from); + if (end > ictx->zero_point) + ictx->zero_point = end; fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode), FSCACHE_INVAL_DIO_WRITE); diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 14c18be5aca00..5b5af96cd4b95 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -569,6 +569,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, struct iov_iter *io_iter) { enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; + struct netfs_inode *ictx = netfs_inode(rreq->inode); size_t lsize; _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); @@ -586,6 +587,14 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, * to make serial calls, it can indicate a short read and then * we will call it again. */ + if (rreq->origin != NETFS_DIO_READ) { + if (subreq->start >= ictx->zero_point) { + source = NETFS_FILL_WITH_ZEROES; + goto set; + } + if (subreq->len > ictx->zero_point - subreq->start) + subreq->len = ictx->zero_point - subreq->start; + } if (subreq->len > rreq->i_size - subreq->start) subreq->len = rreq->i_size - subreq->start; if (rreq->rsize && subreq->len > rreq->rsize) @@ -607,6 +616,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, } } +set: if (subreq->len > rreq->len) pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n", rreq->debug_id, subreq->debug_index, diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index eeb44abe59c52..0e3af37fc9243 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -240,6 +240,11 @@ EXPORT_SYMBOL(netfs_invalidate_folio); bool netfs_release_folio(struct folio *folio, gfp_t gfp) { struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); + unsigned long long end; + + end = folio_pos(folio) + folio_size(folio); + if (end > ctx->zero_point) + ctx->zero_point = end; if (folio_test_private(folio)) return false; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 5407ab8c87835..e3cb4923316b2 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -80,7 +80,7 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) } static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) { - netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops); + netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); } extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 96a65cf9b5ec9..07cd88897c331 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1220,7 +1220,7 @@ static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *s if (rc < 0) goto set_failed; - netfs_resize_file(&src_cifsi->netfs, src_end); + netfs_resize_file(&src_cifsi->netfs, src_end, true); fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end); return 0; @@ -1351,7 +1351,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, smb_file_src, smb_file_target, off, len, destoff); if (rc == 0 && new_size > i_size_read(target_inode)) { truncate_setsize(target_inode, new_size); - netfs_resize_file(&target_cifsi->netfs, new_size); + netfs_resize_file(&target_cifsi->netfs, new_size, true); fscache_resize_cookie(cifs_inode_cookie(target_inode), new_size); } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 8a2dd882a7814..852956aa3c4bb 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -136,6 +136,8 @@ struct netfs_inode { struct fscache_cookie *cache; #endif loff_t remote_i_size; /* Size of the remote file */ + loff_t zero_point; /* Size after which we assume there's no data + * on the server */ unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ @@ -453,31 +455,44 @@ static inline struct netfs_inode *netfs_inode(struct inode *inode) * netfs_inode_init - Initialise a netfslib inode context * @ctx: The netfs inode to initialise * @ops: The netfs's operations list + * @use_zero_point: True to use the zero_point read optimisation * * Initialise the netfs library context struct. This is expected to follow on * directly from the VFS inode struct. */ static inline void netfs_inode_init(struct netfs_inode *ctx, - const struct netfs_request_ops *ops) + const struct netfs_request_ops *ops, + bool use_zero_point) { ctx->ops = ops; ctx->remote_i_size = i_size_read(&ctx->inode); + ctx->zero_point = LLONG_MAX; ctx->flags = 0; #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif + /* ->releasepage() drives zero_point */ + if (use_zero_point) { + ctx->zero_point = ctx->remote_i_size; + mapping_set_release_always(ctx->inode.i_mapping); + } } /** * netfs_resize_file - Note that a file got resized * @ctx: The netfs inode being resized * @new_i_size: The new file size + * @changed_on_server: The change was applied to the server * * Inform the netfs lib that a file got resized so that it can adjust its state. */ -static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size) +static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size, + bool changed_on_server) { - ctx->remote_i_size = new_i_size; + if (changed_on_server) + ctx->remote_i_size = new_i_size; + if (new_i_size < ctx->zero_point) + ctx->zero_point = new_i_size; } /** -- GitLab From 545b135b72002145ade758f7e59c113915283188 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Apr 2022 16:30:11 +0100 Subject: [PATCH 0595/1290] netfs: Export the netfs_sreq tracepoint Export the netfs_sreq tracepoint so that it can be called directly from client filesystems/cache backend modules. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 81a13071b258a..5e77618a79409 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -17,6 +17,8 @@ MODULE_DESCRIPTION("Network fs support"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); +EXPORT_TRACEPOINT_SYMBOL(netfs_sreq); + unsigned netfs_debug; module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); -- GitLab From 3560358a49569d0ade0ee5c9cecb3606dac863c2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 15 Feb 2022 17:02:04 +0000 Subject: [PATCH 0596/1290] afs: Use the netfs write helpers Make afs use the netfs write helpers. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/afs/file.c | 70 +++- fs/afs/internal.h | 10 +- fs/afs/write.c | 707 ++----------------------------------- include/trace/events/afs.h | 23 -- 4 files changed, 84 insertions(+), 726 deletions(-) diff --git a/fs/afs/file.c b/fs/afs/file.c index 3403bb792deb8..833e7c2003e0c 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -34,7 +34,7 @@ const struct file_operations afs_file_operations = { .release = afs_release, .llseek = generic_file_llseek, .read_iter = afs_file_read_iter, - .write_iter = afs_file_write, + .write_iter = netfs_file_write_iter, .mmap = afs_file_mmap, .splice_read = afs_file_splice_read, .splice_write = iter_file_splice_write, @@ -50,16 +50,15 @@ const struct inode_operations afs_file_inode_operations = { }; const struct address_space_operations afs_file_aops = { + .direct_IO = noop_direct_IO, .read_folio = netfs_read_folio, .readahead = netfs_readahead, .dirty_folio = netfs_dirty_folio, - .launder_folio = afs_launder_folio, + .launder_folio = netfs_launder_folio, .release_folio = netfs_release_folio, .invalidate_folio = netfs_invalidate_folio, - .write_begin = afs_write_begin, - .write_end = afs_write_end, - .writepages = afs_writepages, .migrate_folio = filemap_migrate_folio, + .writepages = afs_writepages, }; const struct address_space_operations afs_symlink_aops = { @@ -355,7 +354,10 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio) static int afs_init_request(struct netfs_io_request *rreq, struct file *file) { - rreq->netfs_priv = key_get(afs_file_key(file)); + if (file) + rreq->netfs_priv = key_get(afs_file_key(file)); + rreq->rsize = 256 * 1024; + rreq->wsize = 256 * 1024; return 0; } @@ -372,11 +374,36 @@ static void afs_free_request(struct netfs_io_request *rreq) key_put(rreq->netfs_priv); } +static void afs_update_i_size(struct inode *inode, loff_t new_i_size) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + loff_t i_size; + + write_seqlock(&vnode->cb_lock); + i_size = i_size_read(&vnode->netfs.inode); + if (new_i_size > i_size) { + i_size_write(&vnode->netfs.inode, new_i_size); + inode_set_bytes(&vnode->netfs.inode, new_i_size); + } + write_sequnlock(&vnode->cb_lock); + fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size); +} + +static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq) +{ + struct afs_vnode *vnode = AFS_FS_I(wreq->inode); + + afs_invalidate_cache(vnode, 0); +} + const struct netfs_request_ops afs_req_ops = { .init_request = afs_init_request, .free_request = afs_free_request, .check_write_begin = afs_check_write_begin, .issue_read = afs_issue_read, + .update_i_size = afs_update_i_size, + .invalidate_cache = afs_netfs_invalidate_cache, + .create_write_requests = afs_create_write_requests, }; static void afs_add_open_mmap(struct afs_vnode *vnode) @@ -445,28 +472,39 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); + struct inode *inode = file_inode(iocb->ki_filp); + struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = iocb->ki_filp->private_data; - int ret; + ssize_t ret; - ret = afs_validate(vnode, af->key); + if (iocb->ki_flags & IOCB_DIRECT) + return netfs_unbuffered_read_iter(iocb, iter); + + ret = netfs_start_io_read(inode); if (ret < 0) return ret; - - return generic_file_read_iter(iocb, iter); + ret = afs_validate(vnode, af->key); + if (ret == 0) + ret = filemap_read(iocb, iter, 0); + netfs_end_io_read(inode); + return ret; } static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(in)); + struct inode *inode = file_inode(in); + struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = in->private_data; - int ret; + ssize_t ret; - ret = afs_validate(vnode, af->key); + ret = netfs_start_io_read(inode); if (ret < 0) return ret; - - return filemap_splice_read(in, ppos, pipe, len, flags); + ret = afs_validate(vnode, af->key); + if (ret == 0) + ret = filemap_splice_read(in, ppos, pipe, len, flags); + netfs_end_io_read(inode); + return ret; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a7c8d1d702ee0..32f787f74e76c 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1465,19 +1465,11 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ -extern int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata); -extern int afs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); -extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); -extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); extern void afs_prune_wb_keys(struct afs_vnode *); -int afs_launder_folio(struct folio *); +void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len); /* * xattr.c diff --git a/fs/afs/write.c b/fs/afs/write.c index 80daf28d8f8b8..09b6c0b9a28ed 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -12,228 +12,17 @@ #include #include #include +#include #include "internal.h" -static int afs_writepages_region(struct address_space *mapping, - struct writeback_control *wbc, - unsigned long long start, - unsigned long long end, loff_t *_next, - bool max_one_loop); - -static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len, - loff_t i_size, bool caching); - -#ifdef CONFIG_AFS_FSCACHE -static void afs_folio_start_fscache(bool caching, struct folio *folio) -{ - if (caching) - folio_start_fscache(folio); -} -#else -static void afs_folio_start_fscache(bool caching, struct folio *folio) -{ -} -#endif - -/* - * prepare to perform part of a write to a page - */ -int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **_page, void **fsdata) -{ - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - struct folio *folio; - int ret; - - _enter("{%llx:%llu},%llx,%x", - vnode->fid.vid, vnode->fid.vnode, pos, len); - - /* Prefetch area to be written into the cache if we're caching this - * file. We need to do this before we get a lock on the page in case - * there's more than one writer competing for the same cache block. - */ - ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata); - if (ret < 0) - return ret; - -try_again: - /* See if this page is already partially written in a way that we can - * merge the new write with. - */ - if (folio_test_writeback(folio)) { - trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio); - folio_unlock(folio); - goto wait_for_writeback; - } - - *_page = folio_file_page(folio, pos / PAGE_SIZE); - _leave(" = 0"); - return 0; - -wait_for_writeback: - ret = folio_wait_writeback_killable(folio); - if (ret < 0) - goto error; - - ret = folio_lock_killable(folio); - if (ret < 0) - goto error; - goto try_again; - -error: - folio_put(folio); - _leave(" = %d", ret); - return ret; -} - -/* - * finalise part of a write to a page - */ -int afs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *subpage, void *fsdata) -{ - struct folio *folio = page_folio(subpage); - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - loff_t i_size, write_end_pos; - - _enter("{%llx:%llu},{%lx}", - vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); - - if (!folio_test_uptodate(folio)) { - if (copied < len) { - copied = 0; - goto out; - } - - folio_mark_uptodate(folio); - } - - if (copied == 0) - goto out; - - write_end_pos = pos + copied; - - i_size = i_size_read(&vnode->netfs.inode); - if (write_end_pos > i_size) { - write_seqlock(&vnode->cb_lock); - i_size = i_size_read(&vnode->netfs.inode); - if (write_end_pos > i_size) - afs_set_i_size(vnode, write_end_pos); - write_sequnlock(&vnode->cb_lock); - fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos); - } - - if (folio_mark_dirty(folio)) - _debug("dirtied %lx", folio_index(folio)); - -out: - folio_unlock(folio); - folio_put(folio); - return copied; -} - -/* - * kill all the pages in the given range - */ -static void afs_kill_pages(struct address_space *mapping, - loff_t start, loff_t len) -{ - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct folio *folio; - pgoff_t index = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE, next; - - _enter("{%llx:%llu},%llx @%llx", - vnode->fid.vid, vnode->fid.vnode, len, start); - - do { - _debug("kill %lx (to %lx)", index, last); - - folio = filemap_get_folio(mapping, index); - if (IS_ERR(folio)) { - next = index + 1; - continue; - } - - next = folio_next_index(folio); - - folio_clear_uptodate(folio); - folio_end_writeback(folio); - folio_lock(folio); - generic_error_remove_page(mapping, &folio->page); - folio_unlock(folio); - folio_put(folio); - - } while (index = next, index <= last); - - _leave(""); -} - -/* - * Redirty all the pages in a given range. - */ -static void afs_redirty_pages(struct writeback_control *wbc, - struct address_space *mapping, - loff_t start, loff_t len) -{ - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct folio *folio; - pgoff_t index = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE, next; - - _enter("{%llx:%llu},%llx @%llx", - vnode->fid.vid, vnode->fid.vnode, len, start); - - do { - _debug("redirty %llx @%llx", len, start); - - folio = filemap_get_folio(mapping, index); - if (IS_ERR(folio)) { - next = index + 1; - continue; - } - - next = index + folio_nr_pages(folio); - folio_redirty_for_writepage(wbc, folio); - folio_end_writeback(folio); - folio_put(folio); - } while (index = next, index <= last); - - _leave(""); -} - /* * completion of write to server */ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct address_space *mapping = vnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t end; - - XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); - _enter("{%llx:%llu},{%x @%llx}", vnode->fid.vid, vnode->fid.vnode, len, start); - rcu_read_lock(); - - end = (start + len - 1) / PAGE_SIZE; - xas_for_each(&xas, folio, end) { - if (!folio_test_writeback(folio)) { - kdebug("bad %x @%llx page %lx %lx", - len, start, folio_index(folio), end); - ASSERT(folio_test_writeback(folio)); - } - - trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio); - folio_end_writeback(folio); - } - - rcu_read_unlock(); - afs_prune_wb_keys(vnode); _leave(""); } @@ -370,339 +159,53 @@ try_next_key: return afs_put_operation(op); } -/* - * Extend the region to be written back to include subsequent contiguously - * dirty pages if possible, but don't sleep while doing so. - * - * If this page holds new content, then we can include filler zeros in the - * writeback. - */ -static void afs_extend_writeback(struct address_space *mapping, - struct afs_vnode *vnode, - long *_count, - loff_t start, - loff_t max_len, - bool caching, - size_t *_len) +static void afs_upload_to_server(struct netfs_io_subrequest *subreq) { - struct folio_batch fbatch; - struct folio *folio; - pgoff_t index = (start + *_len) / PAGE_SIZE; - bool stop = true; - unsigned int i; - - XA_STATE(xas, &mapping->i_pages, index); - folio_batch_init(&fbatch); - - do { - /* Firstly, we gather up a batch of contiguous dirty pages - * under the RCU read lock - but we can't clear the dirty flags - * there if any of those pages are mapped. - */ - rcu_read_lock(); - - xas_for_each(&xas, folio, ULONG_MAX) { - stop = true; - if (xas_retry(&xas, folio)) - continue; - if (xa_is_value(folio)) - break; - if (folio_index(folio) != index) - break; - - if (!folio_try_get_rcu(folio)) { - xas_reset(&xas); - continue; - } - - /* Has the folio moved or been split? */ - if (unlikely(folio != xas_reload(&xas))) { - folio_put(folio); - break; - } - - if (!folio_trylock(folio)) { - folio_put(folio); - break; - } - if (!folio_test_dirty(folio) || - folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - folio_put(folio); - break; - } - - index += folio_nr_pages(folio); - *_count -= folio_nr_pages(folio); - *_len += folio_size(folio); - stop = false; - if (*_len >= max_len || *_count <= 0) - stop = true; - - if (!folio_batch_add(&fbatch, folio)) - break; - if (stop) - break; - } - - if (!stop) - xas_pause(&xas); - rcu_read_unlock(); - - /* Now, if we obtained any folios, we can shift them to being - * writable and mark them for caching. - */ - if (!folio_batch_count(&fbatch)) - break; - - for (i = 0; i < folio_batch_count(&fbatch); i++) { - folio = fbatch.folios[i]; - trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio); + struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); + ssize_t ret; - if (!folio_clear_dirty_for_io(folio)) - BUG(); - if (folio_start_writeback(folio)) - BUG(); - afs_folio_start_fscache(caching, folio); - folio_unlock(folio); - } + _enter("%x[%x],%zx", + subreq->rreq->debug_id, subreq->debug_index, subreq->io_iter.count); - folio_batch_release(&fbatch); - cond_resched(); - } while (!stop); + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + ret = afs_store_data(vnode, &subreq->io_iter, subreq->start, + subreq->rreq->origin == NETFS_LAUNDER_WRITE); + netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, + false); } -/* - * Synchronously write back the locked page and any subsequent non-locked dirty - * pages. - */ -static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, - struct writeback_control *wbc, - struct folio *folio, - unsigned long long start, - unsigned long long end) +static void afs_upload_to_server_worker(struct work_struct *work) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct iov_iter iter; - unsigned long long i_size = i_size_read(&vnode->netfs.inode); - size_t len, max_len; - bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode)); - long count = wbc->nr_to_write; - int ret; - - _enter(",%lx,%llx-%llx", folio_index(folio), start, end); - - if (folio_start_writeback(folio)) - BUG(); - afs_folio_start_fscache(caching, folio); - - count -= folio_nr_pages(folio); - - /* Find all consecutive lockable dirty pages that have contiguous - * written regions, stopping when we find a page that is not - * immediately lockable, is not dirty or is missing, or we reach the - * end of the range. - */ - trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio); - - len = folio_size(folio); - if (start < i_size) { - /* Trim the write to the EOF; the extra data is ignored. Also - * put an upper limit on the size of a single storedata op. - */ - max_len = 65536 * 4096; - max_len = min_t(unsigned long long, max_len, end - start + 1); - max_len = min_t(unsigned long long, max_len, i_size - start); - - if (len < max_len) - afs_extend_writeback(mapping, vnode, &count, - start, max_len, caching, &len); - len = min_t(unsigned long long, len, i_size - start); - } - - /* We now have a contiguous set of dirty pages, each with writeback - * set; the first page is still locked at this point, but all the rest - * have been unlocked. - */ - folio_unlock(folio); - - if (start < i_size) { - _debug("write back %zx @%llx [%llx]", len, start, i_size); - - /* Speculatively write to the cache. We have to fix this up - * later if the store fails. - */ - afs_write_to_cache(vnode, start, len, i_size, caching); - - iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); - ret = afs_store_data(vnode, &iter, start, false); - } else { - _debug("write discard %zx @%llx [%llx]", len, start, i_size); - - /* The dirty region was entirely beyond the EOF. */ - fscache_clear_page_bits(mapping, start, len, caching); - afs_pages_written_back(vnode, start, len); - ret = 0; - } - - switch (ret) { - case 0: - wbc->nr_to_write = count; - ret = len; - break; - - default: - pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret); - fallthrough; - case -EACCES: - case -EPERM: - case -ENOKEY: - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EKEYREVOKED: - case -ENETRESET: - afs_redirty_pages(wbc, mapping, start, len); - mapping_set_error(mapping, ret); - break; - - case -EDQUOT: - case -ENOSPC: - afs_redirty_pages(wbc, mapping, start, len); - mapping_set_error(mapping, -ENOSPC); - break; - - case -EROFS: - case -EIO: - case -EREMOTEIO: - case -EFBIG: - case -ENOENT: - case -ENOMEDIUM: - case -ENXIO: - trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail); - afs_kill_pages(mapping, start, len); - mapping_set_error(mapping, ret); - break; - } + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); - _leave(" = %d", ret); - return ret; + afs_upload_to_server(subreq); } /* - * write a region of pages back to the server + * Set up write requests for a writeback slice. We need to add a write request + * for each write we want to make. */ -static int afs_writepages_region(struct address_space *mapping, - struct writeback_control *wbc, - unsigned long long start, - unsigned long long end, loff_t *_next, - bool max_one_loop) +void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len) { - struct folio *folio; - struct folio_batch fbatch; - ssize_t ret; - unsigned int i; - int n, skips = 0; - - _enter("%llx,%llx,", start, end); - folio_batch_init(&fbatch); + struct netfs_io_subrequest *subreq; - do { - pgoff_t index = start / PAGE_SIZE; + _enter("%x,%llx-%llx", wreq->debug_id, start, start + len); - n = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE, - PAGECACHE_TAG_DIRTY, &fbatch); - - if (!n) - break; - for (i = 0; i < n; i++) { - folio = fbatch.folios[i]; - start = folio_pos(folio); /* May regress with THPs */ - - _debug("wback %lx", folio_index(folio)); - - /* At this point we hold neither the i_pages lock nor the - * page lock: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled - * back from swapper_space to tmpfs file mapping - */ -try_again: - if (wbc->sync_mode != WB_SYNC_NONE) { - ret = folio_lock_killable(folio); - if (ret < 0) { - folio_batch_release(&fbatch); - return ret; - } - } else { - if (!folio_trylock(folio)) - continue; - } - - if (folio->mapping != mapping || - !folio_test_dirty(folio)) { - start += folio_size(folio); - folio_unlock(folio); - continue; - } - - if (folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) { - folio_wait_writeback(folio); -#ifdef CONFIG_AFS_FSCACHE - folio_wait_fscache(folio); -#endif - goto try_again; - } - - start += folio_size(folio); - if (wbc->sync_mode == WB_SYNC_NONE) { - if (skips >= 5 || need_resched()) { - *_next = start; - folio_batch_release(&fbatch); - _leave(" = 0 [%llx]", *_next); - return 0; - } - skips++; - } - continue; - } - - if (!folio_clear_dirty_for_io(folio)) - BUG(); - ret = afs_write_back_from_locked_folio(mapping, wbc, - folio, start, end); - if (ret < 0) { - _leave(" = %zd", ret); - folio_batch_release(&fbatch); - return ret; - } - - start += ret; - } - - folio_batch_release(&fbatch); - cond_resched(); - } while (wbc->nr_to_write > 0); - - *_next = start; - _leave(" = 0 [%llx]", *_next); - return 0; + subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER, + start, len, afs_upload_to_server_worker); + if (subreq) + netfs_queue_write_request(subreq); } /* * write some of the pending data back to the server */ -int afs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); - loff_t start, next; int ret; - _enter(""); - /* We have to be careful as we can end up racing with setattr() * truncating the pagecache since the caller doesn't take a lock here * to prevent it. @@ -712,68 +215,11 @@ int afs_writepages(struct address_space *mapping, else if (!down_read_trylock(&vnode->validate_lock)) return 0; - if (wbc->range_cyclic) { - start = mapping->writeback_index * PAGE_SIZE; - ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, - &next, false); - if (ret == 0) { - mapping->writeback_index = next / PAGE_SIZE; - if (start > 0 && wbc->nr_to_write > 0) { - ret = afs_writepages_region(mapping, wbc, 0, - start, &next, false); - if (ret == 0) - mapping->writeback_index = - next / PAGE_SIZE; - } - } - } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { - ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, - &next, false); - if (wbc->nr_to_write > 0 && ret == 0) - mapping->writeback_index = next / PAGE_SIZE; - } else { - ret = afs_writepages_region(mapping, wbc, - wbc->range_start, wbc->range_end, - &next, false); - } - + ret = netfs_writepages(mapping, wbc); up_read(&vnode->validate_lock); - _leave(" = %d", ret); return ret; } -/* - * write to an AFS file - */ -ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) -{ - struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); - struct afs_file *af = iocb->ki_filp->private_data; - ssize_t result; - size_t count = iov_iter_count(from); - - _enter("{%llx:%llu},{%zu},", - vnode->fid.vid, vnode->fid.vnode, count); - - if (IS_SWAPFILE(&vnode->netfs.inode)) { - printk(KERN_INFO - "AFS: Attempt to write to active swap file!\n"); - return -EBUSY; - } - - if (!count) - return 0; - - result = afs_validate(vnode, af->key); - if (result < 0) - return result; - - result = generic_file_write_iter(iocb, from); - - _leave(" = %zd", result); - return result; -} - /* * flush any dirty pages for this process, and check for write errors. * - the return status from this call provides a reliable indication of @@ -802,49 +248,11 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { - struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; - struct inode *inode = file_inode(file); - struct afs_vnode *vnode = AFS_FS_I(inode); - struct afs_file *af = file->private_data; - vm_fault_t ret = VM_FAULT_RETRY; - - _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); - - afs_validate(vnode, af->key); - - sb_start_pagefault(inode->i_sb); - - /* Wait for the page to be written to the cache before we allow it to - * be modified. We then assume the entire page will need writing back. - */ -#ifdef CONFIG_AFS_FSCACHE - if (folio_test_fscache(folio) && - folio_wait_fscache_killable(folio) < 0) - goto out; -#endif - - if (folio_wait_writeback_killable(folio)) - goto out; - - if (folio_lock_killable(folio) < 0) - goto out; - - if (folio_wait_writeback_killable(folio) < 0) { - folio_unlock(folio); - goto out; - } - - if (folio_test_dirty(folio)) - trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio); - else - trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio); - file_update_time(file); - ret = VM_FAULT_LOCKED; -out: - sb_end_pagefault(inode->i_sb); - return ret; + if (afs_validate(AFS_FS_I(file_inode(file)), afs_file_key(file)) < 0) + return VM_FAULT_SIGBUS; + return netfs_page_mkwrite(vmf, NULL); } /* @@ -874,60 +282,3 @@ void afs_prune_wb_keys(struct afs_vnode *vnode) afs_put_wb_key(wbk); } } - -/* - * Clean up a page during invalidation. - */ -int afs_launder_folio(struct folio *folio) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - struct iov_iter iter; - struct bio_vec bv; - unsigned long long fend, i_size = vnode->netfs.inode.i_size; - size_t len; - int ret = 0; - - _enter("{%lx}", folio->index); - - if (folio_clear_dirty_for_io(folio) && folio_pos(folio) < i_size) { - len = folio_size(folio); - fend = folio_pos(folio) + len; - if (vnode->netfs.inode.i_size < fend) - len = fend - i_size; - - bvec_set_folio(&bv, folio, len, 0); - iov_iter_bvec(&iter, WRITE, &bv, 1, len); - - trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); - ret = afs_store_data(vnode, &iter, folio_pos(folio), true); - } - - trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio); - folio_wait_fscache(folio); - return ret; -} - -/* - * Deal with the completion of writing the data to the cache. - */ -static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct afs_vnode *vnode = priv; - - if (IS_ERR_VALUE(transferred_or_error) && - transferred_or_error != -ENOBUFS) - afs_invalidate_cache(vnode, 0); -} - -/* - * Save the write to the cache also. - */ -static void afs_write_to_cache(struct afs_vnode *vnode, - loff_t start, size_t len, loff_t i_size, - bool caching) -{ - fscache_write_to_cache(afs_vnode_cache(vnode), - vnode->netfs.inode.i_mapping, start, len, i_size, - afs_write_to_cache_done, vnode, caching); -} diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 08506680350c8..7543581493722 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -837,29 +837,6 @@ TRACE_EVENT(afs_dir_check_failed, __entry->vnode, __entry->off, __entry->i_size) ); -TRACE_EVENT(afs_folio_dirty, - TP_PROTO(struct afs_vnode *vnode, const char *where, struct folio *folio), - - TP_ARGS(vnode, where, folio), - - TP_STRUCT__entry( - __field(struct afs_vnode *, vnode) - __field(const char *, where) - __field(pgoff_t, index) - __field(size_t, size) - ), - - TP_fast_assign( - __entry->vnode = vnode; - __entry->where = where; - __entry->index = folio_index(folio); - __entry->size = folio_size(folio); - ), - - TP_printk("vn=%p ix=%05lx s=%05lx %s", - __entry->vnode, __entry->index, __entry->size, __entry->where) - ); - TRACE_EVENT(afs_call_state, TP_PROTO(struct afs_call *call, enum afs_call_state from, -- GitLab From 80105ed2fd2715fb09a8fdb0655a8bdc86c120db Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Dec 2023 12:48:56 +0000 Subject: [PATCH 0597/1290] 9p: Use netfslib read/write_iter Use netfslib's read and write iteration helpers, allowing netfslib to take over the management of the page cache for 9p files and to manage local disk caching. In particular, this eliminates write_begin, write_end, writepage and all mentions of struct page and struct folio from 9p. Note that netfslib now offers the possibility of write-through caching if that is desirable for 9p: just set the NETFS_ICTX_WRITETHROUGH flag in v9inode->netfs.flags in v9fs_set_netfs_context(). Note also this is untested as I can't get ganesha.nfsd to correctly parse the config to turn on 9p support. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: v9fs@lists.linux.dev cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org --- fs/9p/vfs_addr.c | 293 ++++++++++------------------------------- fs/9p/vfs_file.c | 89 ++----------- fs/9p/vfs_inode.c | 5 +- fs/9p/vfs_inode_dotl.c | 7 +- 4 files changed, 85 insertions(+), 309 deletions(-) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 055b672a247d7..d8fb407189a09 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -19,12 +19,48 @@ #include #include #include +#include #include "v9fs.h" #include "v9fs_vfs.h" #include "cache.h" #include "fid.h" +static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq) +{ + struct inode *inode = subreq->rreq->inode; + struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); + struct p9_fid *fid = subreq->rreq->netfs_priv; + int err; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + p9_client_write(fid, subreq->start, &subreq->io_iter, &err); + netfs_write_subrequest_terminated(subreq, err < 0 ? err : subreq->len, + false); +} + +static void v9fs_upload_to_server_worker(struct work_struct *work) +{ + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); + + v9fs_upload_to_server(subreq); +} + +/* + * Set up write requests for a writeback slice. We need to add a write request + * for each write we want to make. + */ +static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len) +{ + struct netfs_io_subrequest *subreq; + + subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER, + start, len, v9fs_upload_to_server_worker); + if (subreq) + netfs_queue_write_request(subreq); +} + /** * v9fs_issue_read - Issue a read from 9P * @subreq: The read to make @@ -33,14 +69,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct p9_fid *fid = rreq->netfs_priv; - struct iov_iter to; - loff_t pos = subreq->start + subreq->transferred; - size_t len = subreq->len - subreq->transferred; int total, err; - iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len); - - total = p9_client_read(fid, pos, &to, &err); + total = p9_client_read(fid, subreq->start + subreq->transferred, + &subreq->io_iter, &err); /* if we just extended the file size, any portion not in * cache won't be on server and is zeroes */ @@ -50,23 +82,37 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) } /** - * v9fs_init_request - Initialise a read request + * v9fs_init_request - Initialise a request * @rreq: The read request * @file: The file being read from */ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { - struct p9_fid *fid = file->private_data; - - BUG_ON(!fid); + struct p9_fid *fid; + bool writing = (rreq->origin == NETFS_READ_FOR_WRITE || + rreq->origin == NETFS_WRITEBACK || + rreq->origin == NETFS_WRITETHROUGH || + rreq->origin == NETFS_LAUNDER_WRITE || + rreq->origin == NETFS_UNBUFFERED_WRITE || + rreq->origin == NETFS_DIO_WRITE); + + if (file) { + fid = file->private_data; + BUG_ON(!fid); + p9_fid_get(fid); + } else { + fid = v9fs_fid_find_inode(rreq->inode, writing, INVALID_UID, true); + if (!fid) { + WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n", + rreq->inode->i_private); + return -EINVAL; + } + } /* we might need to read from a fid that was opened write-only * for read-modify-write of page cache, use the writeback fid * for that */ - WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && - !(fid->mode & P9_ORDWR)); - - p9_fid_get(fid); + WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && !(fid->mode & P9_ORDWR)); rreq->netfs_priv = fid; return 0; } @@ -86,217 +132,16 @@ const struct netfs_request_ops v9fs_req_ops = { .init_request = v9fs_init_request, .free_request = v9fs_free_request, .issue_read = v9fs_issue_read, + .create_write_requests = v9fs_create_write_requests, }; -#ifdef CONFIG_9P_FSCACHE -static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct v9fs_inode *v9inode = priv; - __le32 version; - - if (IS_ERR_VALUE(transferred_or_error) && - transferred_or_error != -ENOBUFS) { - version = cpu_to_le32(v9inode->qid.version); - fscache_invalidate(v9fs_inode_cookie(v9inode), &version, - i_size_read(&v9inode->netfs.inode), 0); - } -} -#endif - -static int v9fs_vfs_write_folio_locked(struct folio *folio) -{ - struct inode *inode = folio_inode(folio); - loff_t start = folio_pos(folio); - loff_t i_size = i_size_read(inode); - struct iov_iter from; - size_t len = folio_size(folio); - struct p9_fid *writeback_fid; - int err; - struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); - struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode); - - if (start >= i_size) - return 0; /* Simultaneous truncation occurred */ - - len = min_t(loff_t, i_size - start, len); - - iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len); - - writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true); - if (!writeback_fid) { - WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n", - inode->i_private); - return -EINVAL; - } - - folio_wait_fscache(folio); - folio_start_writeback(folio); - - p9_client_write(writeback_fid, start, &from, &err); - -#ifdef CONFIG_9P_FSCACHE - if (err == 0 && - fscache_cookie_enabled(cookie) && - test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) { - folio_start_fscache(folio); - fscache_write_to_cache(v9fs_inode_cookie(v9inode), - folio_mapping(folio), start, len, i_size, - v9fs_write_to_cache_done, v9inode, - true); - } -#endif - - folio_end_writeback(folio); - p9_fid_put(writeback_fid); - - return err; -} - -static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - int retval; - - p9_debug(P9_DEBUG_VFS, "folio %p\n", folio); - - retval = v9fs_vfs_write_folio_locked(folio); - if (retval < 0) { - if (retval == -EAGAIN) { - folio_redirty_for_writepage(wbc, folio); - retval = 0; - } else { - mapping_set_error(folio_mapping(folio), retval); - } - } else - retval = 0; - - folio_unlock(folio); - return retval; -} - -static int v9fs_launder_folio(struct folio *folio) -{ - int retval; - - if (folio_clear_dirty_for_io(folio)) { - retval = v9fs_vfs_write_folio_locked(folio); - if (retval) - return retval; - } - folio_wait_fscache(folio); - return 0; -} - -/** - * v9fs_direct_IO - 9P address space operation for direct I/O - * @iocb: target I/O control block - * @iter: The data/buffer to use - * - * The presence of v9fs_direct_IO() in the address space ops vector - * allowes open() O_DIRECT flags which would have failed otherwise. - * - * In the non-cached mode, we shunt off direct read and write requests before - * the VFS gets them, so this method should never be called. - * - * Direct IO is not 'yet' supported in the cached mode. Hence when - * this routine is called through generic_file_aio_read(), the read/write fails - * with an error. - * - */ -static ssize_t -v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - loff_t pos = iocb->ki_pos; - ssize_t n; - int err = 0; - - if (iov_iter_rw(iter) == WRITE) { - n = p9_client_write(file->private_data, pos, iter, &err); - if (n) { - struct inode *inode = file_inode(file); - loff_t i_size = i_size_read(inode); - - if (pos + n > i_size) - inode_add_bytes(inode, pos + n - i_size); - } - } else { - n = p9_client_read(file->private_data, pos, iter, &err); - } - return n ? n : err; -} - -static int v9fs_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned int len, - struct page **subpagep, void **fsdata) -{ - int retval; - struct folio *folio; - struct v9fs_inode *v9inode = V9FS_I(mapping->host); - - p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - - /* Prefetch area to be written into the cache if we're caching this - * file. We need to do this before we get a lock on the page in case - * there's more than one writer competing for the same cache block. - */ - retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata); - if (retval < 0) - return retval; - - *subpagep = &folio->page; - return retval; -} - -static int v9fs_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, - struct page *subpage, void *fsdata) -{ - loff_t last_pos = pos + copied; - struct folio *folio = page_folio(subpage); - struct inode *inode = mapping->host; - - p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - - if (!folio_test_uptodate(folio)) { - if (unlikely(copied < len)) { - copied = 0; - goto out; - } - - folio_mark_uptodate(folio); - } - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold the i_mutex. - */ - if (last_pos > inode->i_size) { - inode_add_bytes(inode, last_pos - inode->i_size); - i_size_write(inode, last_pos); -#ifdef CONFIG_9P_FSCACHE - fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL, - &last_pos); -#endif - } - folio_mark_dirty(folio); -out: - folio_unlock(folio); - folio_put(folio); - - return copied; -} - const struct address_space_operations v9fs_addr_operations = { - .read_folio = netfs_read_folio, - .readahead = netfs_readahead, - .dirty_folio = netfs_dirty_folio, - .writepage = v9fs_vfs_writepage, - .write_begin = v9fs_write_begin, - .write_end = v9fs_write_end, - .release_folio = netfs_release_folio, - .invalidate_folio = netfs_invalidate_folio, - .launder_folio = v9fs_launder_folio, - .direct_IO = v9fs_direct_IO, + .read_folio = netfs_read_folio, + .readahead = netfs_readahead, + .dirty_folio = netfs_dirty_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, + .launder_folio = netfs_launder_folio, + .direct_IO = noop_direct_IO, + .writepages = netfs_writepages, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 11cd8d23f6f23..bae330c2f0cf0 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -353,25 +353,15 @@ static ssize_t v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct p9_fid *fid = iocb->ki_filp->private_data; - int ret, err = 0; p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n", fid->fid, iov_iter_count(to), iocb->ki_pos); - if (!(fid->mode & P9L_DIRECT)) { - p9_debug(P9_DEBUG_VFS, "(cached)\n"); - return generic_file_read_iter(iocb, to); - } - - if (iocb->ki_filp->f_flags & O_NONBLOCK) - ret = p9_client_read_once(fid, iocb->ki_pos, to, &err); - else - ret = p9_client_read(fid, iocb->ki_pos, to, &err); - if (!ret) - return err; + if (fid->mode & P9L_DIRECT) + return netfs_unbuffered_read_iter(iocb, to); - iocb->ki_pos += ret; - return ret; + p9_debug(P9_DEBUG_VFS, "(cached)\n"); + return netfs_file_read_iter(iocb, to); } /* @@ -407,46 +397,14 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct p9_fid *fid = file->private_data; - ssize_t retval; - loff_t origin; - int err = 0; p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid); - if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) { - p9_debug(P9_DEBUG_CACHE, "(cached)\n"); - return generic_file_write_iter(iocb, from); - } + if (fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE)) + return netfs_unbuffered_write_iter(iocb, from); - retval = generic_write_checks(iocb, from); - if (retval <= 0) - return retval; - - origin = iocb->ki_pos; - retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err); - if (retval > 0) { - struct inode *inode = file_inode(file); - loff_t i_size; - unsigned long pg_start, pg_end; - - pg_start = origin >> PAGE_SHIFT; - pg_end = (origin + retval - 1) >> PAGE_SHIFT; - if (inode->i_mapping && inode->i_mapping->nrpages) - invalidate_inode_pages2_range(inode->i_mapping, - pg_start, pg_end); - iocb->ki_pos += retval; - i_size = i_size_read(inode); - if (iocb->ki_pos > i_size) { - inode_add_bytes(inode, iocb->ki_pos - i_size); - /* - * Need to serialize against i_size_write() in - * v9fs_stat2inode() - */ - v9fs_i_size_write(inode, iocb->ki_pos); - } - return retval; - } - return err; + p9_debug(P9_DEBUG_CACHE, "(cached)\n"); + return netfs_file_write_iter(iocb, from); } static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, @@ -519,36 +477,7 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) static vm_fault_t v9fs_vm_page_mkwrite(struct vm_fault *vmf) { - struct folio *folio = page_folio(vmf->page); - struct file *filp = vmf->vma->vm_file; - struct inode *inode = file_inode(filp); - - - p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n", - folio, (unsigned long)filp->private_data); - - /* Wait for the page to be written to the cache before we allow it to - * be modified. We then assume the entire page will need writing back. - */ -#ifdef CONFIG_9P_FSCACHE - if (folio_test_fscache(folio) && - folio_wait_fscache_killable(folio) < 0) - return VM_FAULT_NOPAGE; -#endif - - /* Update file times before taking page lock */ - file_update_time(filp); - - if (folio_lock_killable(folio) < 0) - return VM_FAULT_RETRY; - if (folio_mapping(folio) != inode->i_mapping) - goto out_unlock; - folio_wait_stable(folio); - - return VM_FAULT_LOCKED; -out_unlock: - folio_unlock(folio); - return VM_FAULT_NOPAGE; + return netfs_page_mkwrite(vmf, NULL); } static void v9fs_mmap_vm_close(struct vm_area_struct *vma) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index df7ae381a7088..b66466e97459c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -374,10 +374,8 @@ void v9fs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); -#ifdef CONFIG_9P_FSCACHE version = cpu_to_le32(v9inode->qid.version); netfs_clear_inode_writeback(inode, &version); -#endif clear_inode(inode); filemap_fdatawrite(&inode->i_data); @@ -1112,7 +1110,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap, if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); - truncate_pagecache(inode, iattr->ia_size); + netfs_resize_file(netfs_inode(inode), iattr->ia_size, true); #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) { @@ -1180,6 +1178,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; + v9inode->netfs.remote_i_size = stat->length; if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) v9fs_i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index c7319af2f4711..e25fbc988f095 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -598,7 +598,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); - truncate_pagecache(inode, iattr->ia_size); + netfs_resize_file(netfs_inode(inode), iattr->ia_size, true); #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) @@ -655,6 +655,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; + v9inode->netfs.remote_i_size = stat->st_size; if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; @@ -683,8 +684,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, inode->i_mode = mode; } if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && - stat->st_result_mask & P9_STATS_SIZE) + stat->st_result_mask & P9_STATS_SIZE) { + v9inode->netfs.remote_i_size = stat->st_size; v9fs_i_size_write(inode, stat->st_size); + } if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } -- GitLab From 0b670b54119902de75fcd20a50585cf7b573f801 Mon Sep 17 00:00:00 2001 From: Hermes Zhang Date: Fri, 29 Dec 2023 09:36:57 +0800 Subject: [PATCH 0598/1290] Input: gpio-keys - filter gpio_keys -EPROBE_DEFER error messages commit ae42f9288846 ("gpio: Return EPROBE_DEFER if gc->to_irq is NULL") make gpiod_to_irq() possible to return -EPROBE_DEFER when the racing happens. This causes the following error message to be printed: gpio-keys gpio_keys: Unable to get irq number for GPIO 0, error -517 Fix that by changing dev_err() to dev_err_probe() Signed-off-by: Hermes Zhang Link: https://lore.kernel.org/r/20231229013657.692600-1-Hermes.Zhang@axis.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/gpio_keys.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index 06c09b7addf8e..9f3bcd41cf67d 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -592,9 +592,9 @@ static int gpio_keys_setup_key(struct platform_device *pdev, irq = gpiod_to_irq(bdata->gpiod); if (irq < 0) { error = irq; - dev_err(dev, - "Unable to get irq number for GPIO %d, error %d\n", - button->gpio, error); + dev_err_probe(dev, error, + "Unable to get irq number for GPIO %d\n", + button->gpio); return error; } bdata->irq = irq; -- GitLab From 15d3f7664d2776c086f813f1efbfe2ae20a85e89 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 22 Nov 2023 12:47:45 +0900 Subject: [PATCH 0599/1290] kconfig: WERROR unmet symbol dependency When KCONFIG_WERROR env variable is set treat unmet direct symbol dependency as a terminal condition (error). Suggested-by: Stefan Reinauer Signed-off-by: Sergey Senozhatsky Signed-off-by: Masahiro Yamada --- scripts/kconfig/conf.c | 6 ++++++ scripts/kconfig/confdata.c | 13 ++++++++----- scripts/kconfig/lkc_proto.h | 2 ++ scripts/kconfig/symbol.c | 9 +++++++++ 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c index 33d19e419908b..662a5e7c37c28 100644 --- a/scripts/kconfig/conf.c +++ b/scripts/kconfig/conf.c @@ -827,6 +827,9 @@ int main(int ac, char **av) break; } + if (conf_errors()) + exit(1); + if (sync_kconfig) { name = getenv("KCONFIG_NOSILENTUPDATE"); if (name && *name) { @@ -890,6 +893,9 @@ int main(int ac, char **av) break; } + if (sym_dep_errors()) + exit(1); + if (input_mode == savedefconfig) { if (conf_write_defconfig(defconfig_file)) { fprintf(stderr, "n*** Error while saving defconfig to: %s\n\n", diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index f1197e6724316..f53dcdd445976 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -155,6 +155,13 @@ static void conf_message(const char *fmt, ...) static const char *conf_filename; static int conf_lineno, conf_warnings; +bool conf_errors(void) +{ + if (conf_warnings) + return getenv("KCONFIG_WERROR"); + return false; +} + static void conf_warning(const char *fmt, ...) { va_list ap; @@ -365,10 +372,9 @@ int conf_read_simple(const char *name, int def) char *p, *val; struct symbol *sym; int i, def_flags; - const char *warn_unknown, *werror, *sym_name; + const char *warn_unknown, *sym_name; warn_unknown = getenv("KCONFIG_WARN_UNKNOWN_SYMBOLS"); - werror = getenv("KCONFIG_WERROR"); if (name) { in = zconf_fopen(name); } else { @@ -525,9 +531,6 @@ load: free(line); fclose(in); - if (conf_warnings && werror) - exit(1); - return 0; } diff --git a/scripts/kconfig/lkc_proto.h b/scripts/kconfig/lkc_proto.h index 687d8698d801c..a4ae5e9eadadb 100644 --- a/scripts/kconfig/lkc_proto.h +++ b/scripts/kconfig/lkc_proto.h @@ -15,6 +15,7 @@ void conf_set_changed(bool val); bool conf_get_changed(void); void conf_set_changed_callback(void (*fn)(void)); void conf_set_message_callback(void (*fn)(const char *s)); +bool conf_errors(void); /* symbol.c */ extern struct symbol * symbol_hash[SYMBOL_HASHSIZE]; @@ -25,6 +26,7 @@ void print_symbol_for_listconfig(struct symbol *sym); struct symbol ** sym_re_search(const char *pattern); const char * sym_type_name(enum symbol_type type); void sym_calc_value(struct symbol *sym); +bool sym_dep_errors(void); enum symbol_type sym_get_type(struct symbol *sym); bool sym_tristate_within_range(struct symbol *sym,tristate tri); bool sym_set_tristate_value(struct symbol *sym,tristate tri); diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c index a5a4f9153eb73..3e808528aaeab 100644 --- a/scripts/kconfig/symbol.c +++ b/scripts/kconfig/symbol.c @@ -31,6 +31,7 @@ struct symbol symbol_no = { struct symbol *modules_sym; static tristate modules_val; +static int sym_warnings; enum symbol_type sym_get_type(struct symbol *sym) { @@ -311,6 +312,14 @@ static void sym_warn_unmet_dep(struct symbol *sym) " Selected by [m]:\n"); fputs(str_get(&gs), stderr); + sym_warnings++; +} + +bool sym_dep_errors(void) +{ + if (sym_warnings) + return getenv("KCONFIG_WERROR"); + return false; } void sym_calc_value(struct symbol *sym) -- GitLab From 67f8f1e7aa31b6fe17aeee1c581f61fc3dfa331a Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Mon, 11 Dec 2023 19:13:36 -0300 Subject: [PATCH 0600/1290] scripts: Introduce a default git.orderFile When reviewing patches, it looks much nicer to have some changes shown before others, which allow better understanding of the patch before the the .c files reviewing. Introduce a default git.orderFile, in order to help developers getting the best ordering easier. Signed-off-by: Leonardo Bras Acked-by: Randy Dunlap Signed-off-by: Masahiro Yamada --- scripts/git.orderFile | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 scripts/git.orderFile diff --git a/scripts/git.orderFile b/scripts/git.orderFile new file mode 100644 index 0000000000000..5102ba73357f0 --- /dev/null +++ b/scripts/git.orderFile @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: GPL-2.0 + +# order file for git, to produce patches which are easier to review +# by diffing the important stuff like header changes first. +# +# one-off usage: +# git diff -O scripts/git.orderFile ... +# +# add to git config: +# git config diff.orderFile scripts/git.orderFile +# + +MAINTAINERS + +# Documentation +Documentation/* +*.rst + +# git-specific +.gitignore +scripts/git.orderFile + +# build system +Kconfig* +*/Kconfig* +Kbuild* +*/Kbuild* +Makefile* +*/Makefile* +*.mak +*.mk +scripts/* + +# semantic patches +*.cocci + +# headers +*types.h +*.h + +# code +*.c -- GitLab From 1f7f31bf7202adcab9616307bcb11a65fb565f63 Mon Sep 17 00:00:00 2001 From: John Moon Date: Mon, 11 Dec 2023 18:02:57 -0800 Subject: [PATCH 0601/1290] check-uapi: Introduce check-uapi.sh While the kernel community has been good at maintaining backwards compatibility with kernel UAPIs, it would be helpful to have a tool to check if a commit introduces changes that break backwards compatibility. To that end, introduce check-uapi.sh: a simple shell script that checks for changes to UAPI headers using libabigail. libabigail is "a framework which aims at helping developers and software distributors to spot some ABI-related issues like interface incompatibility in ELF shared libraries by performing a static analysis of the ELF binaries at hand." The script uses one of libabigail's tools, "abidiff", to compile the changed header before and after the commit to detect any changes. abidiff "compares the ABI of two shared libraries in ELF format. It emits a meaningful report describing the differences between the two ABIs." The script also includes the ability to check the compatibility of all UAPI headers across commits. This allows developers to inspect the stability of the UAPIs over time. Signed-off-by: John Moon Signed-off-by: Masahiro Yamada --- scripts/check-uapi.sh | 573 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 573 insertions(+) create mode 100755 scripts/check-uapi.sh diff --git a/scripts/check-uapi.sh b/scripts/check-uapi.sh new file mode 100755 index 0000000000000..955581735cb3c --- /dev/null +++ b/scripts/check-uapi.sh @@ -0,0 +1,573 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only +# Script to check commits for UAPI backwards compatibility + +set -o errexit +set -o pipefail + +print_usage() { + name=$(basename "$0") + cat << EOF +$name - check for UAPI header stability across Git commits + +By default, the script will check to make sure the latest commit (or current +dirty changes) did not introduce ABI changes when compared to HEAD^1. You can +check against additional commit ranges with the -b and -p options. + +The script will not check UAPI headers for architectures other than the one +defined in ARCH. + +Usage: $name [-b BASE_REF] [-p PAST_REF] [-j N] [-l ERROR_LOG] [-i] [-q] [-v] + +Options: + -b BASE_REF Base git reference to use for comparison. If unspecified or empty, + will use any dirty changes in tree to UAPI files. If there are no + dirty changes, HEAD will be used. + -p PAST_REF Compare BASE_REF to PAST_REF (e.g. -p v6.1). If unspecified or empty, + will use BASE_REF^1. Must be an ancestor of BASE_REF. Only headers + that exist on PAST_REF will be checked for compatibility. + -j JOBS Number of checks to run in parallel (default: number of CPU cores). + -l ERROR_LOG Write error log to file (default: no error log is generated). + -i Ignore ambiguous changes that may or may not break UAPI compatibility. + -q Quiet operation. + -v Verbose operation (print more information about each header being checked). + +Environmental args: + ABIDIFF Custom path to abidiff binary + CC C compiler (default is "gcc") + ARCH Target architecture for the UAPI check (default is host arch) + +Exit codes: + $SUCCESS) Success + $FAIL_ABI) ABI difference detected + $FAIL_PREREQ) Prerequisite not met +EOF +} + +readonly SUCCESS=0 +readonly FAIL_ABI=1 +readonly FAIL_PREREQ=2 + +# Print to stderr +eprintf() { + # shellcheck disable=SC2059 + printf "$@" >&2 +} + +# Expand an array with a specific character (similar to Python string.join()) +join() { + local IFS="$1" + shift + printf "%s" "$*" +} + +# Create abidiff suppressions +gen_suppressions() { + # Common enum variant names which we don't want to worry about + # being shifted when new variants are added. + local -a enum_regex=( + ".*_AFTER_LAST$" + ".*_CNT$" + ".*_COUNT$" + ".*_END$" + ".*_LAST$" + ".*_MASK$" + ".*_MAX$" + ".*_MAX_BIT$" + ".*_MAX_BPF_ATTACH_TYPE$" + ".*_MAX_ID$" + ".*_MAX_SHIFT$" + ".*_NBITS$" + ".*_NETDEV_NUMHOOKS$" + ".*_NFT_META_IIFTYPE$" + ".*_NL80211_ATTR$" + ".*_NLDEV_NUM_OPS$" + ".*_NUM$" + ".*_NUM_ELEMS$" + ".*_NUM_IRQS$" + ".*_SIZE$" + ".*_TLSMAX$" + "^MAX_.*" + "^NUM_.*" + ) + + # Common padding field names which can be expanded into + # without worrying about users. + local -a padding_regex=( + ".*end$" + ".*pad$" + ".*pad[0-9]?$" + ".*pad_[0-9]?$" + ".*padding$" + ".*padding[0-9]?$" + ".*padding_[0-9]?$" + ".*res$" + ".*resv$" + ".*resv[0-9]?$" + ".*resv_[0-9]?$" + ".*reserved$" + ".*reserved[0-9]?$" + ".*reserved_[0-9]?$" + ".*rsvd[0-9]?$" + ".*unused$" + ) + + cat << EOF +[suppress_type] + type_kind = enum + changed_enumerators_regexp = $(join , "${enum_regex[@]}") +EOF + + for p in "${padding_regex[@]}"; do + cat << EOF +[suppress_type] + type_kind = struct + has_data_member_inserted_at = offset_of_first_data_member_regexp(${p}) +EOF + done + +if [ "$IGNORE_AMBIGUOUS_CHANGES" = "true" ]; then + cat << EOF +[suppress_type] + type_kind = struct + has_data_member_inserted_at = end + has_size_change = yes +EOF +fi +} + +# Check if git tree is dirty +tree_is_dirty() { + ! git diff --quiet +} + +# Get list of files installed in $ref +get_file_list() { + local -r ref="$1" + local -r tree="$(get_header_tree "$ref")" + + # Print all installed headers, filtering out ones that can't be compiled + find "$tree" -type f -name '*.h' -printf '%P\n' | grep -v -f "$INCOMPAT_LIST" +} + +# Add to the list of incompatible headers +add_to_incompat_list() { + local -r ref="$1" + + # Start with the usr/include/Makefile to get a list of the headers + # that don't compile using this method. + if [ ! -f usr/include/Makefile ]; then + eprintf "error - no usr/include/Makefile present at %s\n" "$ref" + eprintf "Note: usr/include/Makefile was added in the v5.3 kernel release\n" + exit "$FAIL_PREREQ" + fi + { + # shellcheck disable=SC2016 + printf 'all: ; @echo $(no-header-test)\n' + cat usr/include/Makefile + } | SRCARCH="$ARCH" make --always-make -f - | tr " " "\n" \ + | grep -v "asm-generic" >> "$INCOMPAT_LIST" + + # The makefile also skips all asm-generic files, but prints "asm-generic/%" + # which won't work for our grep match. Instead, print something grep will match. + printf "asm-generic/.*\.h\n" >> "$INCOMPAT_LIST" +} + +# Compile the simple test app +do_compile() { + local -r inc_dir="$1" + local -r header="$2" + local -r out="$3" + printf "int main(void) { return 0; }\n" | \ + "$CC" -c \ + -o "$out" \ + -x c \ + -O0 \ + -std=c90 \ + -fno-eliminate-unused-debug-types \ + -g \ + "-I${inc_dir}" \ + -include "$header" \ + - +} + +# Run make headers_install +run_make_headers_install() { + local -r ref="$1" + local -r install_dir="$(get_header_tree "$ref")" + make -j "$MAX_THREADS" ARCH="$ARCH" INSTALL_HDR_PATH="$install_dir" \ + headers_install > /dev/null +} + +# Install headers for both git refs +install_headers() { + local -r base_ref="$1" + local -r past_ref="$2" + + for ref in "$base_ref" "$past_ref"; do + printf "Installing user-facing UAPI headers from %s... " "${ref:-dirty tree}" + if [ -n "$ref" ]; then + git archive --format=tar --prefix="${ref}-archive/" "$ref" \ + | (cd "$TMP_DIR" && tar xf -) + ( + cd "${TMP_DIR}/${ref}-archive" + run_make_headers_install "$ref" + add_to_incompat_list "$ref" "$INCOMPAT_LIST" + ) + else + run_make_headers_install "$ref" + add_to_incompat_list "$ref" "$INCOMPAT_LIST" + fi + printf "OK\n" + done + sort -u -o "$INCOMPAT_LIST" "$INCOMPAT_LIST" + sed -i -e '/^$/d' "$INCOMPAT_LIST" +} + +# Print the path to the headers_install tree for a given ref +get_header_tree() { + local -r ref="$1" + printf "%s" "${TMP_DIR}/${ref}/usr" +} + +# Check file list for UAPI compatibility +check_uapi_files() { + local -r base_ref="$1" + local -r past_ref="$2" + local -r abi_error_log="$3" + + local passed=0; + local failed=0; + local -a threads=() + set -o errexit + + printf "Checking changes to UAPI headers between %s and %s...\n" "$past_ref" "${base_ref:-dirty tree}" + # Loop over all UAPI headers that were installed by $past_ref (if they only exist on $base_ref, + # there's no way they're broken and no way to compare anyway) + while read -r file; do + if [ "${#threads[@]}" -ge "$MAX_THREADS" ]; then + if wait "${threads[0]}"; then + passed=$((passed + 1)) + else + failed=$((failed + 1)) + fi + threads=("${threads[@]:1}") + fi + + check_individual_file "$base_ref" "$past_ref" "$file" & + threads+=("$!") + done < <(get_file_list "$past_ref") + + for t in "${threads[@]}"; do + if wait "$t"; then + passed=$((passed + 1)) + else + failed=$((failed + 1)) + fi + done + + if [ -n "$abi_error_log" ]; then + printf 'Generated by "%s %s" from git ref %s\n\n' \ + "$0" "$*" "$(git rev-parse HEAD)" > "$abi_error_log" + fi + + while read -r error_file; do + { + cat "$error_file" + printf "\n\n" + } | tee -a "${abi_error_log:-/dev/null}" >&2 + done < <(find "$TMP_DIR" -type f -name '*.error' | sort) + + total="$((passed + failed))" + if [ "$failed" -gt 0 ]; then + eprintf "error - %d/%d UAPI headers compatible with %s appear _not_ to be backwards compatible\n" \ + "$failed" "$total" "$ARCH" + if [ -n "$abi_error_log" ]; then + eprintf "Failure summary saved to %s\n" "$abi_error_log" + fi + else + printf "All %d UAPI headers compatible with %s appear to be backwards compatible\n" \ + "$total" "$ARCH" + fi + + return "$failed" +} + +# Check an individual file for UAPI compatibility +check_individual_file() { + local -r base_ref="$1" + local -r past_ref="$2" + local -r file="$3" + + local -r base_header="$(get_header_tree "$base_ref")/${file}" + local -r past_header="$(get_header_tree "$past_ref")/${file}" + + if [ ! -f "$base_header" ]; then + mkdir -p "$(dirname "$base_header")" + printf "==== UAPI header %s was removed between %s and %s ====" \ + "$file" "$past_ref" "$base_ref" \ + > "${base_header}.error" + return 1 + fi + + compare_abi "$file" "$base_header" "$past_header" "$base_ref" "$past_ref" +} + +# Perform the A/B compilation and compare output ABI +compare_abi() { + local -r file="$1" + local -r base_header="$2" + local -r past_header="$3" + local -r base_ref="$4" + local -r past_ref="$5" + local -r log="${TMP_DIR}/log/${file}.log" + local -r error_log="${TMP_DIR}/log/${file}.error" + + mkdir -p "$(dirname "$log")" + + if ! do_compile "$(get_header_tree "$base_ref")/include" "$base_header" "${base_header}.bin" 2> "$log"; then + { + warn_str=$(printf "==== Could not compile version of UAPI header %s at %s ====\n" \ + "$file" "$base_ref") + printf "%s\n" "$warn_str" + cat "$log" + printf -- "=%.0s" $(seq 0 ${#warn_str}) + } > "$error_log" + return 1 + fi + + if ! do_compile "$(get_header_tree "$past_ref")/include" "$past_header" "${past_header}.bin" 2> "$log"; then + { + warn_str=$(printf "==== Could not compile version of UAPI header %s at %s ====\n" \ + "$file" "$past_ref") + printf "%s\n" "$warn_str" + cat "$log" + printf -- "=%.0s" $(seq 0 ${#warn_str}) + } > "$error_log" + return 1 + fi + + local ret=0 + "$ABIDIFF" --non-reachable-types \ + --suppressions "$SUPPRESSIONS" \ + "${past_header}.bin" "${base_header}.bin" > "$log" || ret="$?" + if [ "$ret" -eq 0 ]; then + if [ "$VERBOSE" = "true" ]; then + printf "No ABI differences detected in %s from %s -> %s\n" \ + "$file" "$past_ref" "$base_ref" + fi + else + # Bits in abidiff's return code can be used to determine the type of error + if [ $((ret & 0x2)) -gt 0 ]; then + eprintf "error - abidiff did not run properly\n" + exit 1 + fi + + if [ "$IGNORE_AMBIGUOUS_CHANGES" = "true" ] && [ "$ret" -eq 4 ]; then + return 0 + fi + + # If the only changes were additions (not modifications to existing APIs), then + # there's no problem. Ignore these diffs. + if grep "Unreachable types summary" "$log" | grep -q "0 removed" && + grep "Unreachable types summary" "$log" | grep -q "0 changed"; then + return 0 + fi + + { + warn_str=$(printf "==== ABI differences detected in %s from %s -> %s ====" \ + "$file" "$past_ref" "$base_ref") + printf "%s\n" "$warn_str" + sed -e '/summary:/d' -e '/changed type/d' -e '/^$/d' -e 's/^/ /g' "$log" + printf -- "=%.0s" $(seq 0 ${#warn_str}) + if cmp "$past_header" "$base_header" > /dev/null 2>&1; then + printf "\n%s did not change between %s and %s...\n" "$file" "$past_ref" "${base_ref:-dirty tree}" + printf "It's possible a change to one of the headers it includes caused this error:\n" + grep '^#include' "$base_header" + printf "\n" + fi + } > "$error_log" + + return 1 + fi +} + +# Check that a minimum software version number is satisfied +min_version_is_satisfied() { + local -r min_version="$1" + local -r version_installed="$2" + + printf "%s\n%s\n" "$min_version" "$version_installed" \ + | sort -Vc > /dev/null 2>&1 +} + +# Make sure we have the tools we need and the arguments make sense +check_deps() { + ABIDIFF="${ABIDIFF:-abidiff}" + CC="${CC:-gcc}" + ARCH="${ARCH:-$(uname -m)}" + if [ "$ARCH" = "x86_64" ]; then + ARCH="x86" + fi + + local -r abidiff_min_version="2.4" + local -r libdw_min_version_if_clang="0.171" + + if ! command -v "$ABIDIFF" > /dev/null 2>&1; then + eprintf "error - abidiff not found!\n" + eprintf "Please install abigail-tools version %s or greater\n" "$abidiff_min_version" + eprintf "See: https://sourceware.org/libabigail/manual/libabigail-overview.html\n" + return 1 + fi + + local -r abidiff_version="$("$ABIDIFF" --version | cut -d ' ' -f 2)" + if ! min_version_is_satisfied "$abidiff_min_version" "$abidiff_version"; then + eprintf "error - abidiff version too old: %s\n" "$abidiff_version" + eprintf "Please install abigail-tools version %s or greater\n" "$abidiff_min_version" + eprintf "See: https://sourceware.org/libabigail/manual/libabigail-overview.html\n" + return 1 + fi + + if ! command -v "$CC" > /dev/null 2>&1; then + eprintf 'error - %s not found\n' "$CC" + return 1 + fi + + if "$CC" --version | grep -q clang; then + local -r libdw_version="$(ldconfig -v 2>/dev/null | grep -v SKIPPED | grep -m 1 -o 'libdw-[0-9]\+.[0-9]\+' | cut -c 7-)" + if ! min_version_is_satisfied "$libdw_min_version_if_clang" "$libdw_version"; then + eprintf "error - libdw version too old for use with clang: %s\n" "$libdw_version" + eprintf "Please install libdw from elfutils version %s or greater\n" "$libdw_min_version_if_clang" + eprintf "See: https://sourceware.org/elfutils/\n" + return 1 + fi + fi + + if [ ! -d "arch/${ARCH}" ]; then + eprintf 'error - ARCH "%s" is not a subdirectory under arch/\n' "$ARCH" + eprintf "Please set ARCH to one of:\n%s\n" "$(find arch -maxdepth 1 -mindepth 1 -type d -printf '%f ' | fmt)" + return 1 + fi + + if ! git rev-parse --is-inside-work-tree > /dev/null 2>&1; then + eprintf "error - this script requires the kernel tree to be initialized with Git\n" + return 1 + fi + + if ! git rev-parse --verify "$past_ref" > /dev/null 2>&1; then + printf 'error - invalid git reference "%s"\n' "$past_ref" + return 1 + fi + + if [ -n "$base_ref" ]; then + if ! git merge-base --is-ancestor "$past_ref" "$base_ref" > /dev/null 2>&1; then + printf 'error - "%s" is not an ancestor of base ref "%s"\n' "$past_ref" "$base_ref" + return 1 + fi + if [ "$(git rev-parse "$base_ref")" = "$(git rev-parse "$past_ref")" ]; then + printf 'error - "%s" and "%s" are the same reference\n' "$past_ref" "$base_ref" + return 1 + fi + fi +} + +run() { + local base_ref="$1" + local past_ref="$2" + local abi_error_log="$3" + shift 3 + + if [ -z "$KERNEL_SRC" ]; then + KERNEL_SRC="$(realpath "$(dirname "$0")"/..)" + fi + + cd "$KERNEL_SRC" + + if [ -z "$base_ref" ] && ! tree_is_dirty; then + base_ref=HEAD + fi + + if [ -z "$past_ref" ]; then + if [ -n "$base_ref" ]; then + past_ref="${base_ref}^1" + else + past_ref=HEAD + fi + fi + + if ! check_deps; then + exit "$FAIL_PREREQ" + fi + + TMP_DIR=$(mktemp -d) + readonly TMP_DIR + trap 'rm -rf "$TMP_DIR"' EXIT + + readonly INCOMPAT_LIST="${TMP_DIR}/incompat_list.txt" + touch "$INCOMPAT_LIST" + + readonly SUPPRESSIONS="${TMP_DIR}/suppressions.txt" + gen_suppressions > "$SUPPRESSIONS" + + # Run make install_headers for both refs + install_headers "$base_ref" "$past_ref" + + # Check for any differences in the installed header trees + if diff -r -q "$(get_header_tree "$base_ref")" "$(get_header_tree "$past_ref")" > /dev/null 2>&1; then + printf "No changes to UAPI headers were applied between %s and %s\n" "$past_ref" "${base_ref:-dirty tree}" + exit "$SUCCESS" + fi + + if ! check_uapi_files "$base_ref" "$past_ref" "$abi_error_log"; then + exit "$FAIL_ABI" + fi +} + +main() { + MAX_THREADS=$(nproc) + VERBOSE="false" + IGNORE_AMBIGUOUS_CHANGES="false" + quiet="false" + local base_ref="" + while getopts "hb:p:j:l:iqv" opt; do + case $opt in + h) + print_usage + exit "$SUCCESS" + ;; + b) + base_ref="$OPTARG" + ;; + p) + past_ref="$OPTARG" + ;; + j) + MAX_THREADS="$OPTARG" + ;; + l) + abi_error_log="$OPTARG" + ;; + i) + IGNORE_AMBIGUOUS_CHANGES="true" + ;; + q) + quiet="true" + VERBOSE="false" + ;; + v) + VERBOSE="true" + quiet="false" + ;; + *) + exit "$FAIL_PREREQ" + esac + done + + if [ "$quiet" = "true" ]; then + exec > /dev/null 2>&1 + fi + + run "$base_ref" "$past_ref" "$abi_error_log" "$@" +} + +main "$@" -- GitLab From 8c88bc5b489e785c7ead94ce6fc3adb7f76e8715 Mon Sep 17 00:00:00 2001 From: John Moon Date: Mon, 11 Dec 2023 18:02:58 -0800 Subject: [PATCH 0602/1290] docs: dev-tools: Add UAPI checker documentation Add detailed documentation for scripts/check-uapi.sh. Signed-off-by: John Moon Signed-off-by: Masahiro Yamada --- Documentation/dev-tools/checkuapi.rst | 477 ++++++++++++++++++++++++++ Documentation/dev-tools/index.rst | 1 + 2 files changed, 478 insertions(+) create mode 100644 Documentation/dev-tools/checkuapi.rst diff --git a/Documentation/dev-tools/checkuapi.rst b/Documentation/dev-tools/checkuapi.rst new file mode 100644 index 0000000000000..9072f21b50b0c --- /dev/null +++ b/Documentation/dev-tools/checkuapi.rst @@ -0,0 +1,477 @@ +.. SPDX-License-Identifier: GPL-2.0-only + +============ +UAPI Checker +============ + +The UAPI checker (``scripts/check-uapi.sh``) is a shell script which +checks UAPI header files for userspace backwards-compatibility across +the git tree. + +Options +======= + +This section will describe the options with which ``check-uapi.sh`` +can be run. + +Usage:: + + check-uapi.sh [-b BASE_REF] [-p PAST_REF] [-j N] [-l ERROR_LOG] [-i] [-q] [-v] + +Available options:: + + -b BASE_REF Base git reference to use for comparison. If unspecified or empty, + will use any dirty changes in tree to UAPI files. If there are no + dirty changes, HEAD will be used. + -p PAST_REF Compare BASE_REF to PAST_REF (e.g. -p v6.1). If unspecified or empty, + will use BASE_REF^1. Must be an ancestor of BASE_REF. Only headers + that exist on PAST_REF will be checked for compatibility. + -j JOBS Number of checks to run in parallel (default: number of CPU cores). + -l ERROR_LOG Write error log to file (default: no error log is generated). + -i Ignore ambiguous changes that may or may not break UAPI compatibility. + -q Quiet operation. + -v Verbose operation (print more information about each header being checked). + +Environmental args:: + + ABIDIFF Custom path to abidiff binary + CC C compiler (default is "gcc") + ARCH Target architecture of C compiler (default is host arch) + +Exit codes:: + + 0) Success + 1) ABI difference detected + 2) Prerequisite not met + +Examples +======== + +Basic Usage +----------- + +First, let's try making a change to a UAPI header file that obviously +won't break userspace:: + + cat << 'EOF' | patch -l -p1 + --- a/include/uapi/linux/acct.h + +++ b/include/uapi/linux/acct.h + @@ -21,7 +21,9 @@ + #include + #include + + -/* + +#define FOO + + + +/* + * comp_t is a 16-bit "floating" point number with a 3-bit base 8 + * exponent and a 13-bit fraction. + * comp2_t is 24-bit with 5-bit base 2 exponent and 20 bit fraction + diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h + EOF + +Now, let's use the script to validate:: + + % ./scripts/check-uapi.sh + Installing user-facing UAPI headers from dirty tree... OK + Installing user-facing UAPI headers from HEAD... OK + Checking changes to UAPI headers between HEAD and dirty tree... + All 912 UAPI headers compatible with x86 appear to be backwards compatible + +Let's add another change that *might* break userspace:: + + cat << 'EOF' | patch -l -p1 + --- a/include/uapi/linux/bpf.h + +++ b/include/uapi/linux/bpf.h + @@ -74,7 +74,7 @@ struct bpf_insn { + __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ + __s16 off; /* signed offset */ + - __s32 imm; /* signed immediate constant */ + + __u32 imm; /* unsigned immediate constant */ + }; + + /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ + EOF + +The script will catch this:: + + % ./scripts/check-uapi.sh + Installing user-facing UAPI headers from dirty tree... OK + Installing user-facing UAPI headers from HEAD... OK + Checking changes to UAPI headers between HEAD and dirty tree... + ==== ABI differences detected in include/linux/bpf.h from HEAD -> dirty tree ==== + [C] 'struct bpf_insn' changed: + type size hasn't changed + 1 data member change: + type of '__s32 imm' changed: + typedef name changed from __s32 to __u32 at int-ll64.h:27:1 + underlying type 'int' changed: + type name changed from 'int' to 'unsigned int' + type size hasn't changed + ================================================================================== + + error - 1/912 UAPI headers compatible with x86 appear _not_ to be backwards compatible + +In this case, the script is reporting the type change because it could +break a userspace program that passes in a negative number. Now, let's +say you know that no userspace program could possibly be using a negative +value in ``imm``, so changing to an unsigned type there shouldn't hurt +anything. You can pass the ``-i`` flag to the script to ignore changes +in which the userspace backwards compatibility is ambiguous:: + + % ./scripts/check-uapi.sh -i + Installing user-facing UAPI headers from dirty tree... OK + Installing user-facing UAPI headers from HEAD... OK + Checking changes to UAPI headers between HEAD and dirty tree... + All 912 UAPI headers compatible with x86 appear to be backwards compatible + +Now, let's make a similar change that *will* break userspace:: + + cat << 'EOF' | patch -l -p1 + --- a/include/uapi/linux/bpf.h + +++ b/include/uapi/linux/bpf.h + @@ -71,8 +71,8 @@ enum { + + struct bpf_insn { + __u8 code; /* opcode */ + - __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ + + __u8 dst_reg:4; /* dest register */ + __s16 off; /* signed offset */ + __s32 imm; /* signed immediate constant */ + }; + EOF + +Since we're re-ordering an existing struct member, there's no ambiguity, +and the script will report the breakage even if you pass ``-i``:: + + % ./scripts/check-uapi.sh -i + Installing user-facing UAPI headers from dirty tree... OK + Installing user-facing UAPI headers from HEAD... OK + Checking changes to UAPI headers between HEAD and dirty tree... + ==== ABI differences detected in include/linux/bpf.h from HEAD -> dirty tree ==== + [C] 'struct bpf_insn' changed: + type size hasn't changed + 2 data member changes: + '__u8 dst_reg' offset changed from 8 to 12 (in bits) (by +4 bits) + '__u8 src_reg' offset changed from 12 to 8 (in bits) (by -4 bits) + ================================================================================== + + error - 1/912 UAPI headers compatible with x86 appear _not_ to be backwards compatible + +Let's commit the breaking change, then commit the innocuous change:: + + % git commit -m 'Breaking UAPI change' include/uapi/linux/bpf.h + [detached HEAD f758e574663a] Breaking UAPI change + 1 file changed, 1 insertion(+), 1 deletion(-) + % git commit -m 'Innocuous UAPI change' include/uapi/linux/acct.h + [detached HEAD 2e87df769081] Innocuous UAPI change + 1 file changed, 3 insertions(+), 1 deletion(-) + +Now, let's run the script again with no arguments:: + + % ./scripts/check-uapi.sh + Installing user-facing UAPI headers from HEAD... OK + Installing user-facing UAPI headers from HEAD^1... OK + Checking changes to UAPI headers between HEAD^1 and HEAD... + All 912 UAPI headers compatible with x86 appear to be backwards compatible + +It doesn't catch any breaking change because, by default, it only +compares ``HEAD`` to ``HEAD^1``. The breaking change was committed on +``HEAD~2``. If we wanted the search scope to go back further, we'd have to +use the ``-p`` option to pass a different past reference. In this case, +let's pass ``-p HEAD~2`` to the script so it checks UAPI changes between +``HEAD~2`` and ``HEAD``:: + + % ./scripts/check-uapi.sh -p HEAD~2 + Installing user-facing UAPI headers from HEAD... OK + Installing user-facing UAPI headers from HEAD~2... OK + Checking changes to UAPI headers between HEAD~2 and HEAD... + ==== ABI differences detected in include/linux/bpf.h from HEAD~2 -> HEAD ==== + [C] 'struct bpf_insn' changed: + type size hasn't changed + 2 data member changes: + '__u8 dst_reg' offset changed from 8 to 12 (in bits) (by +4 bits) + '__u8 src_reg' offset changed from 12 to 8 (in bits) (by -4 bits) + ============================================================================== + + error - 1/912 UAPI headers compatible with x86 appear _not_ to be backwards compatible + +Alternatively, we could have also run with ``-b HEAD~``. This would set the +base reference to ``HEAD~`` so then the script would compare it to ``HEAD~^1``. + +Architecture-specific Headers +----------------------------- + +Consider this change:: + + cat << 'EOF' | patch -l -p1 + --- a/arch/arm64/include/uapi/asm/sigcontext.h + +++ b/arch/arm64/include/uapi/asm/sigcontext.h + @@ -70,6 +70,7 @@ struct sigcontext { + struct _aarch64_ctx { + __u32 magic; + __u32 size; + + __u32 new_var; + }; + + #define FPSIMD_MAGIC 0x46508001 + EOF + +This is a change to an arm64-specific UAPI header file. In this example, I'm +running the script from an x86 machine with an x86 compiler, so, by default, +the script only checks x86-compatible UAPI header files:: + + % ./scripts/check-uapi.sh + Installing user-facing UAPI headers from dirty tree... OK + Installing user-facing UAPI headers from HEAD... OK + No changes to UAPI headers were applied between HEAD and dirty tree + +With an x86 compiler, we can't check header files in ``arch/arm64``, so the +script doesn't even try. + +If we want to check the header file, we'll have to use an arm64 compiler and +set ``ARCH`` accordingly:: + + % CC=aarch64-linux-gnu-gcc ARCH=arm64 ./scripts/check-uapi.sh + Installing user-facing UAPI headers from dirty tree... OK + Installing user-facing UAPI headers from HEAD... OK + Checking changes to UAPI headers between HEAD and dirty tree... + ==== ABI differences detected in include/asm/sigcontext.h from HEAD -> dirty tree ==== + [C] 'struct _aarch64_ctx' changed: + type size changed from 64 to 96 (in bits) + 1 data member insertion: + '__u32 new_var', at offset 64 (in bits) at sigcontext.h:73:1 + -- snip -- + [C] 'struct zt_context' changed: + type size changed from 128 to 160 (in bits) + 2 data member changes (1 filtered): + '__u16 nregs' offset changed from 64 to 96 (in bits) (by +32 bits) + '__u16 __reserved[3]' offset changed from 80 to 112 (in bits) (by +32 bits) + ======================================================================================= + + error - 1/884 UAPI headers compatible with arm64 appear _not_ to be backwards compatible + +We can see with ``ARCH`` and ``CC`` set properly for the file, the ABI +change is reported properly. Also notice that the total number of UAPI +header files checked by the script changes. This is because the number +of headers installed for arm64 platforms is different than x86. + +Cross-Dependency Breakages +-------------------------- + +Consider this change:: + + cat << 'EOF' | patch -l -p1 + --- a/include/uapi/linux/types.h + +++ b/include/uapi/linux/types.h + @@ -52,7 +52,7 @@ typedef __u32 __bitwise __wsum; + #define __aligned_be64 __be64 __attribute__((aligned(8))) + #define __aligned_le64 __le64 __attribute__((aligned(8))) + + -typedef unsigned __bitwise __poll_t; + +typedef unsigned short __bitwise __poll_t; + + #endif /* __ASSEMBLY__ */ + #endif /* _UAPI_LINUX_TYPES_H */ + EOF + +Here, we're changing a ``typedef`` in ``types.h``. This doesn't break +a UAPI in ``types.h``, but other UAPIs in the tree may break due to +this change:: + + % ./scripts/check-uapi.sh + Installing user-facing UAPI headers from dirty tree... OK + Installing user-facing UAPI headers from HEAD... OK + Checking changes to UAPI headers between HEAD and dirty tree... + ==== ABI differences detected in include/linux/eventpoll.h from HEAD -> dirty tree ==== + [C] 'struct epoll_event' changed: + type size changed from 96 to 80 (in bits) + 2 data member changes: + type of '__poll_t events' changed: + underlying type 'unsigned int' changed: + type name changed from 'unsigned int' to 'unsigned short int' + type size changed from 32 to 16 (in bits) + '__u64 data' offset changed from 32 to 16 (in bits) (by -16 bits) + ======================================================================================== + include/linux/eventpoll.h did not change between HEAD and dirty tree... + It's possible a change to one of the headers it includes caused this error: + #include + #include + +Note that the script noticed the failing header file did not change, +so it assumes one of its includes must have caused the breakage. Indeed, +we can see ``linux/types.h`` is used from ``eventpoll.h``. + +UAPI Header Removals +-------------------- + +Consider this change:: + + cat << 'EOF' | patch -l -p1 + diff --git a/include/uapi/asm-generic/Kbuild b/include/uapi/asm-generic/Kbuild + index ebb180aac74e..a9c88b0a8b3b 100644 + --- a/include/uapi/asm-generic/Kbuild + +++ b/include/uapi/asm-generic/Kbuild + @@ -31,6 +31,6 @@ mandatory-y += stat.h + mandatory-y += statfs.h + mandatory-y += swab.h + mandatory-y += termbits.h + -mandatory-y += termios.h + +#mandatory-y += termios.h + mandatory-y += types.h + mandatory-y += unistd.h + EOF + +This script removes a UAPI header file from the install list. Let's run +the script:: + + % ./scripts/check-uapi.sh + Installing user-facing UAPI headers from dirty tree... OK + Installing user-facing UAPI headers from HEAD... OK + Checking changes to UAPI headers between HEAD and dirty tree... + ==== UAPI header include/asm/termios.h was removed between HEAD and dirty tree ==== + + error - 1/912 UAPI headers compatible with x86 appear _not_ to be backwards compatible + +Removing a UAPI header is considered a breaking change, and the script +will flag it as such. + +Checking Historic UAPI Compatibility +------------------------------------ + +You can use the ``-b`` and ``-p`` options to examine different chunks of your +git tree. For example, to check all changed UAPI header files between tags +v6.0 and v6.1, you'd run:: + + % ./scripts/check-uapi.sh -b v6.1 -p v6.0 + Installing user-facing UAPI headers from v6.1... OK + Installing user-facing UAPI headers from v6.0... OK + Checking changes to UAPI headers between v6.0 and v6.1... + + --- snip --- + error - 37/907 UAPI headers compatible with x86 appear _not_ to be backwards compatible + +Note: Before v5.3, a header file needed by the script is not present, +so the script is unable to check changes before then. + +You'll notice that the script detected many UAPI changes that are not +backwards compatible. Knowing that kernel UAPIs are supposed to be stable +forever, this is an alarming result. This brings us to the next section: +caveats. + +Caveats +======= + +The UAPI checker makes no assumptions about the author's intention, so some +types of changes may be flagged even though they intentionally break UAPI. + +Removals For Refactoring or Deprecation +--------------------------------------- + +Sometimes drivers for very old hardware are removed, such as in this example:: + + % ./scripts/check-uapi.sh -b ba47652ba655 + Installing user-facing UAPI headers from ba47652ba655... OK + Installing user-facing UAPI headers from ba47652ba655^1... OK + Checking changes to UAPI headers between ba47652ba655^1 and ba47652ba655... + ==== UAPI header include/linux/meye.h was removed between ba47652ba655^1 and ba47652ba655 ==== + + error - 1/910 UAPI headers compatible with x86 appear _not_ to be backwards compatible + +The script will always flag removals (even if they're intentional). + +Struct Expansions +----------------- + +Depending on how a structure is handled in kernelspace, a change which +expands a struct could be non-breaking. + +If a struct is used as the argument to an ioctl, then the kernel driver +must be able to handle ioctl commands of any size. Beyond that, you need +to be careful when copying data from the user. Say, for example, that +``struct foo`` is changed like this:: + + struct foo { + __u64 a; /* added in version 1 */ + + __u32 b; /* added in version 2 */ + + __u32 c; /* added in version 2 */ + } + +By default, the script will flag this kind of change for further review:: + + [C] 'struct foo' changed: + type size changed from 64 to 128 (in bits) + 2 data member insertions: + '__u32 b', at offset 64 (in bits) + '__u32 c', at offset 96 (in bits) + +However, it is possible that this change was made safely. + +If a userspace program was built with version 1, it will think +``sizeof(struct foo)`` is 8. That size will be encoded in the +ioctl value that gets sent to the kernel. If the kernel is built +with version 2, it will think the ``sizeof(struct foo)`` is 16. + +The kernel can use the ``_IOC_SIZE`` macro to get the size encoded +in the ioctl code that the user passed in and then use +``copy_struct_from_user()`` to safely copy the value:: + + int handle_ioctl(unsigned long cmd, unsigned long arg) + { + switch _IOC_NR(cmd) { + 0x01: { + struct foo my_cmd; /* size 16 in the kernel */ + + ret = copy_struct_from_user(&my_cmd, arg, sizeof(struct foo), _IOC_SIZE(cmd)); + ... + +``copy_struct_from_user`` will zero the struct in the kernel and then copy +only the bytes passed in from the user (leaving new members zeroized). +If the user passed in a larger struct, the extra members are ignored. + +If you know this situation is accounted for in the kernel code, you can +pass ``-i`` to the script, and struct expansions like this will be ignored. + +Flex Array Migration +-------------------- + +While the script handles expansion into an existing flex array, it does +still flag initial migration to flex arrays from 1-element fake flex +arrays. For example:: + + struct foo { + __u32 x; + - __u32 flex[1]; /* fake flex */ + + __u32 flex[]; /* real flex */ + }; + +This change would be flagged by the script:: + + [C] 'struct foo' changed: + type size changed from 64 to 32 (in bits) + 1 data member change: + type of '__u32 flex[1]' changed: + type name changed from '__u32[1]' to '__u32[]' + array type size changed from 32 to 'unknown' + array type subrange 1 changed length from 1 to 'unknown' + +At this time, there's no way to filter these types of changes, so be +aware of this possible false positive. + +Summary +------- + +While many types of false positives are filtered out by the script, +it's possible there are some cases where the script flags a change +which does not break UAPI. It's also possible a change which *does* +break userspace would not be flagged by this script. While the script +has been run on much of the kernel history, there could still be corner +cases that are not accounted for. + +The intention is for this script to be used as a quick check for +maintainers or automated tooling, not as the end-all authority on +patch compatibility. It's best to remember: use your best judgment +(and ideally a unit test in userspace) to make sure your UAPI changes +are backwards-compatible! diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst index 6b0663075dc04..0876f5a2cf55a 100644 --- a/Documentation/dev-tools/index.rst +++ b/Documentation/dev-tools/index.rst @@ -34,6 +34,7 @@ Documentation/dev-tools/testing-overview.rst kselftest kunit/index ktap + checkuapi .. only:: subproject and html -- GitLab From 7beba04eb305393e3f8386390f25b4a9475f27f2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 16 Dec 2023 01:06:37 +0900 Subject: [PATCH 0603/1290] kbuild: resolve symlinks for O= properly Currently, Kbuild follows the logical chain of directories for the O= option, just like 'cd' (or 'realpath --logical') does. Example: $ mkdir -p /tmp/a /tmp/x/y $ ln -s /tmp/x/y /tmp/a/b $ realpath /tmp/a/b/.. /tmp/x $ realpath --logical /tmp/a/b/.. /tmp/a $ make O=/tmp/a/b/.. defconfig make[1]: Entering directory '/tmp/a' [snip] make[1]: Leaving directory '/tmp/a' 'make O=/tmp/a/b/.. defconfig' creates the kernel configuration in /tmp/a instead of /tmp/x despite /tmp/a/b/.. resolves to /tmp/x. This is because Kbuild internally uses the 'cd ... && pwd' for the path resolution, but this behavior is not predictable for users. Additionally, it is not consistent with how the Kbuild handles the M= option or GNU Make works with 'make -C /tmp/a/b/..'. Using the physical directory structure for the O= option seems more reasonable. The comment says "expand a shell special character '~'", but it has already been expanded to the home directory in the command line. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- Makefile | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 5a11804af640a..6204a3803a90f 100644 --- a/Makefile +++ b/Makefile @@ -190,14 +190,11 @@ ifeq ("$(origin O)", "command line") endif ifneq ($(KBUILD_OUTPUT),) -# Make's built-in functions such as $(abspath ...), $(realpath ...) cannot -# expand a shell special character '~'. We use a somewhat tedious way here. -abs_objtree := $(shell mkdir -p $(KBUILD_OUTPUT) && cd $(KBUILD_OUTPUT) && pwd) -$(if $(abs_objtree),, \ - $(error failed to create output directory "$(KBUILD_OUTPUT)")) - +# $(realpath ...) gets empty if the path does not exist. Run 'mkdir -p' first. +$(shell mkdir -p "$(KBUILD_OUTPUT)") # $(realpath ...) resolves symlinks -abs_objtree := $(realpath $(abs_objtree)) +abs_objtree := $(realpath $(KBUILD_OUTPUT)) +$(if $(abs_objtree),,$(error failed to create output directory "$(KBUILD_OUTPUT)")) endif # ifneq ($(KBUILD_OUTPUT),) ifneq ($(words $(subst :, ,$(abs_srctree))), 1) -- GitLab From f3b2306bea33b3a86ad2df4dcfab53b629e1bc84 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 20 Dec 2023 00:33:15 +0000 Subject: [PATCH 0604/1290] gen_init_cpio: Apply mtime supplied by user to all file types Currently gen_init_cpio -d is applied to symlinks, directories and special files. These files are created by gen_init_cpio from their description. Without option current time(NULL) is used. And regular files that go in initramfs are created before cpio generation, so their mtime(s) are preserved. This is usually not an issue as reproducible builds should rebuild everything in the distribution, including binaries, configs and whatever other regular files may find their way into kernel's initramfs. On the other hand, gen_initramfs.sh usage claims: > -d Use date for all file mtime values Ar Arista initramfs files are managed with version control system that preserves mtime. Those are configs, boot parameters, init scripts, version files, platform-specific files, probably some others, too. While it's certainly possible to work this around by copying the file into temp directory and adjusting mtime prior to gen_init_cpio call, I don't see why it needs workarounds. The intended user of -d option is the one that needs to create a reproducible build, see commit a8b8017c34fe ("initramfs: Use KBUILD_BUILD_TIMESTAMP for generated entries"). If a user wants the build reproduction, they use -d , which can be set on all types of files, without surprising exceptions and workarounds. Let's KISS here and just apply the time that user specified with -d option. Based-on-a-patch-by: Baptiste Covolato Link: https://lore.kernel.org/lkml/20181025215133.20138-1-baptiste@arista.com/ Signed-off-by: Dmitry Safonov Signed-off-by: Masahiro Yamada --- usr/gen_init_cpio.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/usr/gen_init_cpio.c b/usr/gen_init_cpio.c index 61230532fef10..edcdb8abfa31c 100644 --- a/usr/gen_init_cpio.c +++ b/usr/gen_init_cpio.c @@ -27,6 +27,7 @@ static unsigned int offset; static unsigned int ino = 721; static time_t default_mtime; +static bool do_file_mtime; static bool do_csum = false; struct file_handler { @@ -329,6 +330,7 @@ static int cpio_mkfile(const char *name, const char *location, int file; int retval; int rc = -1; + time_t mtime; int namesize; unsigned int i; uint32_t csum = 0; @@ -347,16 +349,21 @@ static int cpio_mkfile(const char *name, const char *location, goto error; } - if (buf.st_mtime > 0xffffffff) { - fprintf(stderr, "%s: Timestamp exceeds maximum cpio timestamp, clipping.\n", - location); - buf.st_mtime = 0xffffffff; - } + if (do_file_mtime) { + mtime = default_mtime; + } else { + mtime = buf.st_mtime; + if (mtime > 0xffffffff) { + fprintf(stderr, "%s: Timestamp exceeds maximum cpio timestamp, clipping.\n", + location); + mtime = 0xffffffff; + } - if (buf.st_mtime < 0) { - fprintf(stderr, "%s: Timestamp negative, clipping.\n", - location); - buf.st_mtime = 0; + if (mtime < 0) { + fprintf(stderr, "%s: Timestamp negative, clipping.\n", + location); + mtime = 0; + } } if (buf.st_size > 0xffffffff) { @@ -387,7 +394,7 @@ static int cpio_mkfile(const char *name, const char *location, (long) uid, /* uid */ (long) gid, /* gid */ nlinks, /* nlink */ - (long) buf.st_mtime, /* mtime */ + (long) mtime, /* mtime */ size, /* filesize */ 3, /* major */ 1, /* minor */ @@ -536,8 +543,9 @@ static void usage(const char *prog) "file /sbin/kinit /usr/src/klibc/kinit/kinit 0755 0 0\n" "\n" " is time in seconds since Epoch that will be used\n" - "as mtime for symlinks, special files and directories. The default\n" - "is to use the current time for these entries.\n" + "as mtime for symlinks, directories, regular and special files.\n" + "The default is to use the current time for all files, but\n" + "preserve modification time for regular files.\n" "-c: calculate and store 32-bit checksums for file data.\n", prog); } @@ -594,6 +602,7 @@ int main (int argc, char *argv[]) usage(argv[0]); exit(1); } + do_file_mtime = true; break; case 'c': do_csum = true; -- GitLab From 9c65810cfb215f40f14d2c00694911fbc5408761 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 20 Dec 2023 00:40:49 +0900 Subject: [PATCH 0605/1290] kbuild: deb-pkg: split debian/copyright from the mkdebian script Copy debian/copyright instead of generating it by the 'cat' command. I also updated '2018' to '2023' while I was here. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/debian/copyright | 16 ++++++++++++++++ scripts/package/mkdebian | 21 +-------------------- 2 files changed, 17 insertions(+), 20 deletions(-) create mode 100644 scripts/package/debian/copyright diff --git a/scripts/package/debian/copyright b/scripts/package/debian/copyright new file mode 100644 index 0000000000000..4f1f06221f092 --- /dev/null +++ b/scripts/package/debian/copyright @@ -0,0 +1,16 @@ +This is a packaged upstream version of the Linux kernel. + +The sources may be found at most Linux archive sites, including: +https://www.kernel.org/pub/linux/kernel + +Copyright: 1991 - 2023 Linus Torvalds and others. + +The git repository for mainline kernel development is at: +git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 dated June, 1991. + +On Debian GNU/Linux systems, the complete text of the GNU General Public +License version 2 can be found in `/usr/share/common-licenses/GPL-2'. diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index c1a36da85e84f..91f0e09600b11 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -188,26 +188,6 @@ $sourcename ($packageversion) $distribution; urgency=low -- $maintainer $(date -R) EOF -# Generate copyright file -cat < debian/copyright -This is a packaged upstream version of the Linux kernel. - -The sources may be found at most Linux archive sites, including: -https://www.kernel.org/pub/linux/kernel - -Copyright: 1991 - 2018 Linus Torvalds and others. - -The git repository for mainline kernel development is at: -git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; version 2 dated June, 1991. - -On Debian GNU/Linux systems, the complete text of the GNU General Public -License version 2 can be found in \`/usr/share/common-licenses/GPL-2'. -EOF - # Generate a control file cat < debian/control Source: $sourcename @@ -268,6 +248,7 @@ ARCH := ${ARCH} KERNELRELEASE := ${KERNELRELEASE} EOF +cp "${srctree}/scripts/package/debian/copyright" debian/ cp "${srctree}/scripts/package/debian/rules" debian/ exit 0 -- GitLab From b88365b6d74edc88a9d283c837fec05b13d401a6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 20 Dec 2023 03:19:56 +0900 Subject: [PATCH 0606/1290] kbuild: deb-pkg: hard-code Build-Depends The condition to require libelf-dev:native is stale because objtool is now enabled by CONFIG_OBJTOOL instead of CONFIG_UNWINDER_ORC. Not only objtool but also resolve_btfids requires libelf-dev:native; therefore, CONFIG_DEBUG_INFO_BTF should be checked as well. Similarly, CONFIG_SYSTEM_TRUSTED_KEYRING is not the only case that requires libssl-dev:native. Perhaps, the following code would provide better coverage, but it is hard to maintain (and may still be imperfect). if is_enabled CONFIG_OBJTOOL || is_enabled CONFIG_DEBUG_INFO_BTF; then build_depends="${build_depends}, libelf-dev:native" fi if is_enabled CONFIG_SYSTEM_TRUSTED_KEYRING || is_enabled CONFIG_SYSTEM_REVOCATION_LIST || is_enabled CONFIG_MODULE_SIG_FORMAT; then build_depends="${build_depends}, libssl-dev:native" fi Let's hard-code the build dependency. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/mkdebian | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index 91f0e09600b11..93a24712b9a1d 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -176,8 +176,6 @@ else fi echo $debarch > debian/arch -extra_build_depends=", $(if_enabled_echo CONFIG_UNWINDER_ORC libelf-dev:native)" -extra_build_depends="$extra_build_depends, $(if_enabled_echo CONFIG_SYSTEM_TRUSTED_KEYRING libssl-dev:native)" # Generate a simple changelog template cat < debian/changelog @@ -195,7 +193,8 @@ Section: kernel Priority: optional Maintainer: $maintainer Rules-Requires-Root: no -Build-Depends: bc, debhelper, rsync, kmod, cpio, bison, flex $extra_build_depends +Build-Depends: debhelper +Build-Depends-Arch: bc, bison, cpio, flex, kmod, libelf-dev:native, libssl-dev:native, rsync Homepage: https://www.kernel.org/ Package: $packagename-$version -- GitLab From 466e6fc43fb9eefa26ec766f78ce18616bf84b9a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 Dec 2023 22:52:38 +0900 Subject: [PATCH 0607/1290] kbuild: deb-pkg: factor out common Make options in debian/rules This avoids code duplication between binary-arch and built-arch. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/debian/rules | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules index 3dafa9496c636..26bc6239e200a 100755 --- a/scripts/package/debian/rules +++ b/scripts/package/debian/rules @@ -10,20 +10,20 @@ ifneq (,$(filter-out parallel=1,$(filter parallel=%,$(DEB_BUILD_OPTIONS)))) MAKEFLAGS += -j$(NUMJOBS) endif +make-opts = ARCH=$(ARCH) KERNELRELEASE=$(KERNELRELEASE) + .PHONY: binary binary-indep binary-arch binary: binary-arch binary-indep binary-indep: build-indep binary-arch: build-arch - $(MAKE) -f $(srctree)/Makefile ARCH=$(ARCH) \ - KERNELRELEASE=$(KERNELRELEASE) \ + $(MAKE) -f $(srctree)/Makefile $(make-opts) \ run-command KBUILD_RUN_COMMAND=+$(srctree)/scripts/package/builddeb .PHONY: build build-indep build-arch build: build-arch build-indep build-indep: build-arch: - $(MAKE) -f $(srctree)/Makefile ARCH=$(ARCH) \ - KERNELRELEASE=$(KERNELRELEASE) \ + $(MAKE) -f $(srctree)/Makefile $(make-opts) \ $(shell $(srctree)/scripts/package/deb-build-option) \ olddefconfig all -- GitLab From 7d4f07d5cb71728cea2b6fe8b087a0ce1dbda23a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 Dec 2023 22:52:39 +0900 Subject: [PATCH 0608/1290] kbuild: deb-pkg: squash scripts/package/deb-build-option to debian/rules The binary-arch target needs to use the same CROSS_COMPILE as used in build-arch; otherwise, 'make run-command' may attempt to resync the .config file. Squash scripts/package/deb-build-option into debian/rules, as it is a small amount of code. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/deb-build-option | 14 -------------- scripts/package/debian/rules | 5 +++-- 2 files changed, 3 insertions(+), 16 deletions(-) delete mode 100755 scripts/package/deb-build-option diff --git a/scripts/package/deb-build-option b/scripts/package/deb-build-option deleted file mode 100755 index 7950eff01781a..0000000000000 --- a/scripts/package/deb-build-option +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0-only - -# Set up CROSS_COMPILE if not defined yet -if [ "${CROSS_COMPILE+set}" != "set" -a "${DEB_HOST_ARCH}" != "${DEB_BUILD_ARCH}" ]; then - echo CROSS_COMPILE=${DEB_HOST_GNU_TYPE}- -fi - -version=$(dpkg-parsechangelog -S Version) -debian_revision="${version##*-}" - -if [ "${version}" != "${debian_revision}" ]; then - echo KBUILD_BUILD_VERSION=${debian_revision} -fi diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules index 26bc6239e200a..529b71b55efa5 100755 --- a/scripts/package/debian/rules +++ b/scripts/package/debian/rules @@ -10,7 +10,9 @@ ifneq (,$(filter-out parallel=1,$(filter parallel=%,$(DEB_BUILD_OPTIONS)))) MAKEFLAGS += -j$(NUMJOBS) endif -make-opts = ARCH=$(ARCH) KERNELRELEASE=$(KERNELRELEASE) +revision = $(lastword $(subst -, ,$(shell dpkg-parsechangelog -S Version))) +CROSS_COMPILE ?= $(filter-out $(DEB_BUILD_GNU_TYPE)-, $(DEB_HOST_GNU_TYPE)-) +make-opts = ARCH=$(ARCH) KERNELRELEASE=$(KERNELRELEASE) KBUILD_BUILD_VERSION=$(revision) $(addprefix CROSS_COMPILE=,$(CROSS_COMPILE)) .PHONY: binary binary-indep binary-arch binary: binary-arch binary-indep @@ -24,7 +26,6 @@ build: build-arch build-indep build-indep: build-arch: $(MAKE) -f $(srctree)/Makefile $(make-opts) \ - $(shell $(srctree)/scripts/package/deb-build-option) \ olddefconfig all .PHONY: clean -- GitLab From 2cb54a19ac7153b9a26a72098c495187f64c2276 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Fri, 29 Dec 2023 06:54:41 -0800 Subject: [PATCH 0609/1290] apparmor: Fix ref count leak in task_kill apparmor_task_kill was not putting the task_cred reference tc, or the cred_label reference tc when dealing with a passed in cred, fix this by using a single fn exit. Fixes: 90c436a64a6e ("apparmor: pass cred through to audit info.") Signed-off-by: John Johansen --- security/apparmor/lsm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 3eb992801a7f5..d851a6cc19d32 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -954,7 +954,6 @@ static int apparmor_task_kill(struct task_struct *target, struct kernel_siginfo cl = aa_get_newest_cred_label(cred); error = aa_may_signal(cred, cl, tc, tl, sig); aa_put_label(cl); - return error; } else { cl = __begin_current_label_crit_section(); error = aa_may_signal(current_cred(), cl, tc, tl, sig); -- GitLab From 65cdd3ada7dcec1863022b2ca218ae409f99c759 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Mon, 1 Jan 2024 14:02:06 -0600 Subject: [PATCH 0610/1290] dt-bindings: input: iqs269a: Add bindings for slider gestures This patch adds bindings for slider gestures that can be expressed by the device. Signed-off-by: Jeff LaBundy Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/ZZMaPrbSi4IrzwKF@nixie71 Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/iqs269a.yaml | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/Documentation/devicetree/bindings/input/iqs269a.yaml b/Documentation/devicetree/bindings/input/iqs269a.yaml index 3c430d38594f1..b42f07542d27d 100644 --- a/Documentation/devicetree/bindings/input/iqs269a.yaml +++ b/Documentation/devicetree/bindings/input/iqs269a.yaml @@ -9,6 +9,9 @@ title: Azoteq IQS269A Capacitive Touch Controller maintainers: - Jeff LaBundy +allOf: + - $ref: input.yaml# + description: | The Azoteq IQS269A is an 8-channel capacitive touch controller that features additional Hall-effect and inductive sensing capabilities. @@ -204,6 +207,63 @@ properties: default: 1 description: Specifies the slider coordinate filter strength. + linux,keycodes: + minItems: 1 + maxItems: 8 + description: | + Specifies the numeric keycodes associated with each available gesture in + the following order (enter 0 for unused gestures): + 0: Slider 0 tap + 1: Slider 0 hold + 2: Slider 0 positive flick or swipe + 3: Slider 0 negative flick or swipe + 4: Slider 1 tap + 5: Slider 1 hold + 6: Slider 1 positive flick or swipe + 7: Slider 1 negative flick or swipe + + azoteq,gesture-swipe: + type: boolean + description: + Directs the device to interpret axial gestures as a swipe (finger remains + on slider) instead of a flick (finger leaves slider). + + azoteq,timeout-tap-ms: + multipleOf: 16 + minimum: 0 + maximum: 4080 + default: 400 + description: + Specifies the length of time (in ms) within which a slider touch must be + released in order to be interpreted as a tap. Default and maximum values + as well as step size are reduced by a factor of 4 with device version 2. + + azoteq,timeout-swipe-ms: + multipleOf: 16 + minimum: 0 + maximum: 4080 + default: 2000 + description: + Specifies the length of time (in ms) within which an axial gesture must be + completed in order to be interpreted as a flick or swipe. Default and max- + imum values as well as step size are reduced by a factor of 4 with device + version 2. + + azoteq,thresh-swipe: + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 0 + maximum: 255 + default: 128 + description: + Specifies the number of points across which an axial gesture must travel + in order to be interpreted as a flick or swipe. + +dependencies: + azoteq,gesture-swipe: ["linux,keycodes"] + azoteq,timeout-tap-ms: ["linux,keycodes"] + azoteq,timeout-swipe-ms: ["linux,keycodes"] + azoteq,thresh-swipe: ["linux,keycodes"] + patternProperties: "^channel@[0-7]$": type: object @@ -484,6 +544,14 @@ examples: azoteq,hall-enable; azoteq,suspend-mode = <2>; + linux,keycodes = , + , + , + ; + + azoteq,timeout-tap-ms = <400>; + azoteq,timeout-swipe-ms = <800>; + channel@0 { reg = <0x0>; -- GitLab From 00521a9bf96eaab358886d7a0c531d52027d3241 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Mon, 1 Jan 2024 14:02:23 -0600 Subject: [PATCH 0611/1290] Input: iqs269a - add support for slider gestures This patch adds support for slider gestures that can be expressed by the device. Each gesture (e.g. tap or hold) can be mapped to a unique keycode for either slider 0 or 1. With this change, raw slider coordinates are reported only if the slider has no keycodes defined. This prevents unwanted mouse cur- sor movement when expressing axial gestures (e.g. swipe) and also eliminates some unnecessary I2C traffic. Different revisions of silicon use different tap and swipe timeout step sizes. Apply an appropriate scaling factor depending on which revision is found. To facilitate this change, store the iqs269_ver_info struct in the driver's private data so that other functions can use it after the driver has probed. Last but not least, a former reserved field in iqs269_ver_info now contains useful information; give it a name (fw_num). Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/ZZMaT46WQq1/Nrsb@nixie71 Signed-off-by: Dmitry Torokhov --- drivers/input/misc/iqs269a.c | 220 ++++++++++++++++++++++++++++++----- 1 file changed, 191 insertions(+), 29 deletions(-) diff --git a/drivers/input/misc/iqs269a.c b/drivers/input/misc/iqs269a.c index 1abce35b955e4..0d0b5cdc78303 100644 --- a/drivers/input/misc/iqs269a.c +++ b/drivers/input/misc/iqs269a.c @@ -9,6 +9,7 @@ * axial sliders presented by the device. */ +#include #include #include #include @@ -26,6 +27,8 @@ #define IQS269_VER_INFO 0x00 #define IQS269_VER_INFO_PROD_NUM 0x4F +#define IQS269_VER_INFO_FW_NUM_2 0x03 +#define IQS269_VER_INFO_FW_NUM_3 0x10 #define IQS269_SYS_FLAGS 0x02 #define IQS269_SYS_FLAGS_SHOW_RESET BIT(15) @@ -53,6 +56,7 @@ #define IQS269_SYS_SETTINGS_ULP_UPDATE_MASK GENMASK(10, 8) #define IQS269_SYS_SETTINGS_ULP_UPDATE_SHIFT 8 #define IQS269_SYS_SETTINGS_ULP_UPDATE_MAX 7 +#define IQS269_SYS_SETTINGS_SLIDER_SWIPE BIT(7) #define IQS269_SYS_SETTINGS_RESEED_OFFSET BIT(6) #define IQS269_SYS_SETTINGS_EVENT_MODE BIT(5) #define IQS269_SYS_SETTINGS_EVENT_MODE_LP BIT(4) @@ -69,6 +73,7 @@ #define IQS269_FILT_STR_MAX 3 #define IQS269_EVENT_MASK_SYS BIT(6) +#define IQS269_EVENT_MASK_GESTURE BIT(3) #define IQS269_EVENT_MASK_DEEP BIT(2) #define IQS269_EVENT_MASK_TOUCH BIT(1) #define IQS269_EVENT_MASK_PROX BIT(0) @@ -97,6 +102,10 @@ #define IQS269_MISC_B_TRACKING_UI_ENABLE BIT(4) #define IQS269_MISC_B_FILT_STR_SLIDER GENMASK(1, 0) +#define IQS269_TIMEOUT_TAP_MS_MAX 4080 +#define IQS269_TIMEOUT_SWIPE_MS_MAX 4080 +#define IQS269_THRESH_SWIPE_MAX 255 + #define IQS269_CHx_ENG_A_MEAS_CAP_SIZE BIT(15) #define IQS269_CHx_ENG_A_RX_GND_INACTIVE BIT(13) #define IQS269_CHx_ENG_A_LOCAL_CAP_SIZE BIT(12) @@ -175,6 +184,20 @@ enum iqs269_event_id { IQS269_EVENT_DEEP_UP, }; +enum iqs269_slider_id { + IQS269_SLIDER_NONE, + IQS269_SLIDER_KEY, + IQS269_SLIDER_RAW, +}; + +enum iqs269_gesture_id { + IQS269_GESTURE_TAP, + IQS269_GESTURE_HOLD, + IQS269_GESTURE_FLICK_POS, + IQS269_GESTURE_FLICK_NEG, + IQS269_NUM_GESTURES, +}; + struct iqs269_switch_desc { unsigned int code; bool enabled; @@ -234,7 +257,7 @@ struct iqs269_ver_info { u8 prod_num; u8 sw_num; u8 hw_num; - u8 padding; + u8 fw_num; } __packed; struct iqs269_ch_reg { @@ -285,16 +308,33 @@ struct iqs269_private { struct regmap *regmap; struct mutex lock; struct iqs269_switch_desc switches[ARRAY_SIZE(iqs269_events)]; + struct iqs269_ver_info ver_info; struct iqs269_sys_reg sys_reg; struct completion ati_done; struct input_dev *keypad; struct input_dev *slider[IQS269_NUM_SL]; unsigned int keycode[ARRAY_SIZE(iqs269_events) * IQS269_NUM_CH]; + unsigned int sl_code[IQS269_NUM_SL][IQS269_NUM_GESTURES]; unsigned int ch_num; bool hall_enable; bool ati_current; }; +static enum iqs269_slider_id iqs269_slider_type(struct iqs269_private *iqs269, + int slider_num) +{ + int i; + + if (!iqs269->sys_reg.slider_select[slider_num]) + return IQS269_SLIDER_NONE; + + for (i = 0; i < IQS269_NUM_GESTURES; i++) + if (iqs269->sl_code[slider_num][i] != KEY_RESERVED) + return IQS269_SLIDER_KEY; + + return IQS269_SLIDER_RAW; +} + static int iqs269_ati_mode_set(struct iqs269_private *iqs269, unsigned int ch_num, unsigned int mode) { @@ -1004,6 +1044,76 @@ static int iqs269_parse_prop(struct iqs269_private *iqs269) general |= (val << IQS269_SYS_SETTINGS_ULP_UPDATE_SHIFT); } + if (device_property_present(&client->dev, "linux,keycodes")) { + int scale = 1; + int count = device_property_count_u32(&client->dev, + "linux,keycodes"); + if (count > IQS269_NUM_GESTURES * IQS269_NUM_SL) { + dev_err(&client->dev, "Too many keycodes present\n"); + return -EINVAL; + } else if (count < 0) { + dev_err(&client->dev, "Failed to count keycodes: %d\n", + count); + return count; + } + + error = device_property_read_u32_array(&client->dev, + "linux,keycodes", + *iqs269->sl_code, count); + if (error) { + dev_err(&client->dev, "Failed to read keycodes: %d\n", + error); + return error; + } + + if (device_property_present(&client->dev, + "azoteq,gesture-swipe")) + general |= IQS269_SYS_SETTINGS_SLIDER_SWIPE; + + /* + * Early revisions of silicon use a more granular step size for + * tap and swipe gesture timeouts; scale them appropriately. + */ + if (iqs269->ver_info.fw_num < IQS269_VER_INFO_FW_NUM_3) + scale = 4; + + if (!device_property_read_u32(&client->dev, + "azoteq,timeout-tap-ms", &val)) { + if (val > IQS269_TIMEOUT_TAP_MS_MAX / scale) { + dev_err(&client->dev, "Invalid timeout: %u\n", + val); + return -EINVAL; + } + + sys_reg->timeout_tap = val / (16 / scale); + } + + if (!device_property_read_u32(&client->dev, + "azoteq,timeout-swipe-ms", + &val)) { + if (val > IQS269_TIMEOUT_SWIPE_MS_MAX / scale) { + dev_err(&client->dev, "Invalid timeout: %u\n", + val); + return -EINVAL; + } + + sys_reg->timeout_swipe = val / (16 / scale); + } + + if (!device_property_read_u32(&client->dev, + "azoteq,thresh-swipe", &val)) { + if (val > IQS269_THRESH_SWIPE_MAX) { + dev_err(&client->dev, "Invalid threshold: %u\n", + val); + return -EINVAL; + } + + sys_reg->thresh_swipe = val; + } + + sys_reg->event_mask &= ~IQS269_EVENT_MASK_GESTURE; + } + general &= ~IQS269_SYS_SETTINGS_RESEED_OFFSET; if (device_property_present(&client->dev, "azoteq,reseed-offset")) general |= IQS269_SYS_SETTINGS_RESEED_OFFSET; @@ -1012,10 +1122,11 @@ static int iqs269_parse_prop(struct iqs269_private *iqs269) /* * As per the datasheet, enable streaming during normal-power mode if - * either slider is in use. In that case, the device returns to event - * mode during low-power mode. + * raw coordinates will be read from either slider. In that case, the + * device returns to event mode during low-power mode. */ - if (sys_reg->slider_select[0] || sys_reg->slider_select[1]) + if (iqs269_slider_type(iqs269, 0) == IQS269_SLIDER_RAW || + iqs269_slider_type(iqs269, 1) == IQS269_SLIDER_RAW) general |= IQS269_SYS_SETTINGS_EVENT_MODE_LP; general |= IQS269_SYS_SETTINGS_REDO_ATI; @@ -1106,19 +1217,37 @@ static int iqs269_input_init(struct iqs269_private *iqs269) } for (i = 0; i < IQS269_NUM_SL; i++) { - if (!iqs269->sys_reg.slider_select[i]) + if (iqs269_slider_type(iqs269, i) == IQS269_SLIDER_NONE) continue; iqs269->slider[i] = devm_input_allocate_device(&client->dev); if (!iqs269->slider[i]) return -ENOMEM; + iqs269->slider[i]->keycodemax = ARRAY_SIZE(iqs269->sl_code[i]); + iqs269->slider[i]->keycode = iqs269->sl_code[i]; + iqs269->slider[i]->keycodesize = sizeof(**iqs269->sl_code); + iqs269->slider[i]->name = i ? "iqs269a_slider_1" : "iqs269a_slider_0"; iqs269->slider[i]->id.bustype = BUS_I2C; - input_set_capability(iqs269->slider[i], EV_KEY, BTN_TOUCH); - input_set_abs_params(iqs269->slider[i], ABS_X, 0, 255, 0, 0); + for (j = 0; j < IQS269_NUM_GESTURES; j++) + if (iqs269->sl_code[i][j] != KEY_RESERVED) + input_set_capability(iqs269->slider[i], EV_KEY, + iqs269->sl_code[i][j]); + + /* + * Present the slider as a narrow trackpad if one or more chan- + * nels have been selected to participate, but no gestures have + * been mapped to a keycode. + */ + if (iqs269_slider_type(iqs269, i) == IQS269_SLIDER_RAW) { + input_set_capability(iqs269->slider[i], + EV_KEY, BTN_TOUCH); + input_set_abs_params(iqs269->slider[i], + ABS_X, 0, 255, 0, 0); + } error = input_register_device(iqs269->slider[i]); if (error) { @@ -1167,28 +1296,62 @@ static int iqs269_report(struct iqs269_private *iqs269) if (be16_to_cpu(flags.system) & IQS269_SYS_FLAGS_IN_ATI) return 0; - error = regmap_raw_read(iqs269->regmap, IQS269_SLIDER_X, slider_x, - sizeof(slider_x)); - if (error) { - dev_err(&client->dev, "Failed to read slider position: %d\n", - error); - return error; + if (iqs269_slider_type(iqs269, 0) == IQS269_SLIDER_RAW || + iqs269_slider_type(iqs269, 1) == IQS269_SLIDER_RAW) { + error = regmap_raw_read(iqs269->regmap, IQS269_SLIDER_X, + slider_x, sizeof(slider_x)); + if (error) { + dev_err(&client->dev, + "Failed to read slider position: %d\n", error); + return error; + } } for (i = 0; i < IQS269_NUM_SL; i++) { - if (!iqs269->sys_reg.slider_select[i]) + flags.gesture >>= (i * IQS269_NUM_GESTURES); + + switch (iqs269_slider_type(iqs269, i)) { + case IQS269_SLIDER_NONE: continue; - /* - * Report BTN_TOUCH if any channel that participates in the - * slider is in a state of touch. - */ - if (flags.states[IQS269_ST_OFFS_TOUCH] & - iqs269->sys_reg.slider_select[i]) { - input_report_key(iqs269->slider[i], BTN_TOUCH, 1); - input_report_abs(iqs269->slider[i], ABS_X, slider_x[i]); - } else { - input_report_key(iqs269->slider[i], BTN_TOUCH, 0); + case IQS269_SLIDER_KEY: + for (j = 0; j < IQS269_NUM_GESTURES; j++) + input_report_key(iqs269->slider[i], + iqs269->sl_code[i][j], + flags.gesture & BIT(j)); + + if (!(flags.gesture & (BIT(IQS269_GESTURE_FLICK_NEG) | + BIT(IQS269_GESTURE_FLICK_POS) | + BIT(IQS269_GESTURE_TAP)))) + break; + + input_sync(iqs269->slider[i]); + + /* + * Momentary gestures are followed by a complementary + * release cycle so as to emulate a full keystroke. + */ + for (j = 0; j < IQS269_NUM_GESTURES; j++) + if (j != IQS269_GESTURE_HOLD) + input_report_key(iqs269->slider[i], + iqs269->sl_code[i][j], + 0); + break; + + case IQS269_SLIDER_RAW: + /* + * The slider is considered to be in a state of touch + * if any selected channels are in a state of touch. + */ + state = flags.states[IQS269_ST_OFFS_TOUCH]; + state &= iqs269->sys_reg.slider_select[i]; + + input_report_key(iqs269->slider[i], BTN_TOUCH, state); + + if (state) + input_report_abs(iqs269->slider[i], + ABS_X, slider_x[i]); + break; } input_sync(iqs269->slider[i]); @@ -1595,7 +1758,6 @@ static const struct regmap_config iqs269_regmap_config = { static int iqs269_probe(struct i2c_client *client) { - struct iqs269_ver_info ver_info; struct iqs269_private *iqs269; int error; @@ -1617,14 +1779,14 @@ static int iqs269_probe(struct i2c_client *client) mutex_init(&iqs269->lock); init_completion(&iqs269->ati_done); - error = regmap_raw_read(iqs269->regmap, IQS269_VER_INFO, &ver_info, - sizeof(ver_info)); + error = regmap_raw_read(iqs269->regmap, IQS269_VER_INFO, + &iqs269->ver_info, sizeof(iqs269->ver_info)); if (error) return error; - if (ver_info.prod_num != IQS269_VER_INFO_PROD_NUM) { + if (iqs269->ver_info.prod_num != IQS269_VER_INFO_PROD_NUM) { dev_err(&client->dev, "Unrecognized product number: 0x%02X\n", - ver_info.prod_num); + iqs269->ver_info.prod_num); return -EINVAL; } -- GitLab From 56c083e3f5723c68055ab5f97d21e55aab7bd8ef Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Mon, 1 Jan 2024 14:02:35 -0600 Subject: [PATCH 0612/1290] dt-bindings: input: iqs269a: Add bindings for OTP variants This patch adds bindings for the D0 order code of the device. This order code represents an OTP variant that enables a touch-and-hold function in place of slider 1. Also included is the ability to specify the 00 order code (default option with no OTP customization) explicitly. Signed-off-by: Jeff LaBundy Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/ZZMaW9RkQ9bKXOUn@nixie71 Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/iqs269a.yaml | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/input/iqs269a.yaml b/Documentation/devicetree/bindings/input/iqs269a.yaml index b42f07542d27d..2c3f693b8982c 100644 --- a/Documentation/devicetree/bindings/input/iqs269a.yaml +++ b/Documentation/devicetree/bindings/input/iqs269a.yaml @@ -20,7 +20,10 @@ description: | properties: compatible: - const: azoteq,iqs269a + enum: + - azoteq,iqs269a + - azoteq,iqs269a-00 + - azoteq,iqs269a-d0 reg: maxItems: 1 @@ -207,6 +210,16 @@ properties: default: 1 description: Specifies the slider coordinate filter strength. + azoteq,touch-hold-ms: + multipleOf: 256 + minimum: 256 + maximum: 65280 + default: 5120 + description: + Specifies the length of time (in ms) for which the channel selected by + 'azoteq,gpio3-select' must be held in a state of touch in order for an + approximately 60-ms pulse to be asserted on the GPIO4 pin. + linux,keycodes: minItems: 1 maxItems: 8 @@ -514,6 +527,21 @@ patternProperties: additionalProperties: false +if: + properties: + compatible: + contains: + enum: + - azoteq,iqs269a-d0 +then: + patternProperties: + "^channel@[0-7]$": + properties: + azoteq,slider1-select: false +else: + properties: + azoteq,touch-hold-ms: false + required: - compatible - reg -- GitLab From 992bbc9e9ab9abe5bc1ea9a1a8d61331b28f848e Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Mon, 1 Jan 2024 14:02:45 -0600 Subject: [PATCH 0613/1290] Input: iqs269a - add support for OTP variants This patch adds support for each available OTP variant of the device. The OTP configuration cannot be read over I2C, so it is derived from a compatible string instead. Early revisions of the D0 order code require their OTP-enabled func- tionality to be manually restored following a soft reset; this patch accommodates this erratum as well. Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/ZZMaZbdk6iAKUjlm@nixie71 Signed-off-by: Dmitry Torokhov --- drivers/input/misc/iqs269a.c | 92 ++++++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 3 deletions(-) diff --git a/drivers/input/misc/iqs269a.c b/drivers/input/misc/iqs269a.c index 0d0b5cdc78303..cd14ff9f57cf2 100644 --- a/drivers/input/misc/iqs269a.c +++ b/drivers/input/misc/iqs269a.c @@ -102,6 +102,11 @@ #define IQS269_MISC_B_TRACKING_UI_ENABLE BIT(4) #define IQS269_MISC_B_FILT_STR_SLIDER GENMASK(1, 0) +#define IQS269_TOUCH_HOLD_SLIDER_SEL 0x89 +#define IQS269_TOUCH_HOLD_DEFAULT 0x14 +#define IQS269_TOUCH_HOLD_MS_MIN 256 +#define IQS269_TOUCH_HOLD_MS_MAX 65280 + #define IQS269_TIMEOUT_TAP_MS_MAX 4080 #define IQS269_TIMEOUT_SWIPE_MS_MAX 4080 #define IQS269_THRESH_SWIPE_MAX 255 @@ -151,6 +156,10 @@ #define IQS269_MAX_REG 0xFF +#define IQS269_OTP_OPTION_DEFAULT 0x00 +#define IQS269_OTP_OPTION_TWS 0xD0 +#define IQS269_OTP_OPTION_HOLD BIT(7) + #define IQS269_NUM_CH 8 #define IQS269_NUM_SL 2 @@ -315,6 +324,7 @@ struct iqs269_private { struct input_dev *slider[IQS269_NUM_SL]; unsigned int keycode[ARRAY_SIZE(iqs269_events) * IQS269_NUM_CH]; unsigned int sl_code[IQS269_NUM_SL][IQS269_NUM_GESTURES]; + unsigned int otp_option; unsigned int ch_num; bool hall_enable; bool ati_current; @@ -325,6 +335,14 @@ static enum iqs269_slider_id iqs269_slider_type(struct iqs269_private *iqs269, { int i; + /* + * Slider 1 is unavailable if the touch-and-hold option is enabled via + * OTP. In that case, the channel selection register is repurposed for + * the touch-and-hold timer ceiling. + */ + if (slider_num && (iqs269->otp_option & IQS269_OTP_OPTION_HOLD)) + return IQS269_SLIDER_NONE; + if (!iqs269->sys_reg.slider_select[slider_num]) return IQS269_SLIDER_NONE; @@ -565,7 +583,8 @@ static int iqs269_parse_chan(struct iqs269_private *iqs269, if (fwnode_property_present(ch_node, "azoteq,slider0-select")) iqs269->sys_reg.slider_select[0] |= BIT(reg); - if (fwnode_property_present(ch_node, "azoteq,slider1-select")) + if (fwnode_property_present(ch_node, "azoteq,slider1-select") && + !(iqs269->otp_option & IQS269_OTP_OPTION_HOLD)) iqs269->sys_reg.slider_select[1] |= BIT(reg); ch_reg = &iqs269->sys_reg.ch_reg[reg]; @@ -990,7 +1009,43 @@ static int iqs269_parse_prop(struct iqs269_private *iqs269) sys_reg->blocking = 0; sys_reg->slider_select[0] = 0; - sys_reg->slider_select[1] = 0; + + /* + * If configured via OTP to do so, the device asserts a pulse on the + * GPIO4 pin for approximately 60 ms once a selected channel is held + * in a state of touch for a configurable length of time. + * + * In that case, the register used for slider 1 channel selection is + * repurposed for the touch-and-hold timer ceiling. + */ + if (iqs269->otp_option & IQS269_OTP_OPTION_HOLD) { + if (!device_property_read_u32(&client->dev, + "azoteq,touch-hold-ms", &val)) { + if (val < IQS269_TOUCH_HOLD_MS_MIN || + val > IQS269_TOUCH_HOLD_MS_MAX) { + dev_err(&client->dev, + "Invalid touch-and-hold ceiling: %u\n", + val); + return -EINVAL; + } + + sys_reg->slider_select[1] = val / 256; + } else if (iqs269->ver_info.fw_num < IQS269_VER_INFO_FW_NUM_3) { + /* + * The default touch-and-hold timer ceiling initially + * read from early revisions of silicon is invalid if + * the device experienced a soft reset between power- + * on and the read operation. + * + * To protect against this case, explicitly cache the + * default value so that it is restored each time the + * device is re-initialized. + */ + sys_reg->slider_select[1] = IQS269_TOUCH_HOLD_DEFAULT; + } + } else { + sys_reg->slider_select[1] = 0; + } sys_reg->event_mask = ~((u8)IQS269_EVENT_MASK_SYS); @@ -1137,12 +1192,30 @@ static int iqs269_parse_prop(struct iqs269_private *iqs269) return 0; } +static const struct reg_sequence iqs269_tws_init[] = { + { IQS269_TOUCH_HOLD_SLIDER_SEL, IQS269_TOUCH_HOLD_DEFAULT }, + { 0xF0, 0x580F }, + { 0xF0, 0x59EF }, +}; + static int iqs269_dev_init(struct iqs269_private *iqs269) { int error; mutex_lock(&iqs269->lock); + /* + * Early revisions of silicon require the following workaround in order + * to restore any OTP-enabled functionality after a soft reset. + */ + if (iqs269->otp_option == IQS269_OTP_OPTION_TWS && + iqs269->ver_info.fw_num < IQS269_VER_INFO_FW_NUM_3) { + error = regmap_multi_reg_write(iqs269->regmap, iqs269_tws_init, + ARRAY_SIZE(iqs269_tws_init)); + if (error) + goto err_mutex; + } + error = regmap_update_bits(iqs269->regmap, IQS269_HALL_UI, IQS269_HALL_UI_ENABLE, iqs269->hall_enable ? ~0 : 0); @@ -1779,6 +1852,8 @@ static int iqs269_probe(struct i2c_client *client) mutex_init(&iqs269->lock); init_completion(&iqs269->ati_done); + iqs269->otp_option = (uintptr_t)device_get_match_data(&client->dev); + error = regmap_raw_read(iqs269->regmap, IQS269_VER_INFO, &iqs269->ver_info, sizeof(iqs269->ver_info)); if (error) @@ -1889,7 +1964,18 @@ static int iqs269_resume(struct device *dev) static DEFINE_SIMPLE_DEV_PM_OPS(iqs269_pm, iqs269_suspend, iqs269_resume); static const struct of_device_id iqs269_of_match[] = { - { .compatible = "azoteq,iqs269a" }, + { + .compatible = "azoteq,iqs269a", + .data = (void *)IQS269_OTP_OPTION_DEFAULT, + }, + { + .compatible = "azoteq,iqs269a-00", + .data = (void *)IQS269_OTP_OPTION_DEFAULT, + }, + { + .compatible = "azoteq,iqs269a-d0", + .data = (void *)IQS269_OTP_OPTION_TWS, + }, { } }; MODULE_DEVICE_TABLE(of, iqs269_of_match); -- GitLab From 58f1e9d3a30438042fc9ed65b3dc56b2e5f7886a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 2 Jan 2024 09:39:17 -0800 Subject: [PATCH 0614/1290] cxl/region: use %pap format to print resource_size_t Use "%pap" to print a resource_size_t (phys_addr_t derived type) to prevent build warnings on 32-bit arches (seen on i386 and riscv-32). ../drivers/cxl/core/region.c: In function 'alloc_hpa': ../drivers/cxl/core/region.c:556:25: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type 'resource_size_t' {aka 'unsigned int'} [-Wformat=] 556 | "HPA allocation error (%ld) for size:%#llx in %s %pr\n", Fixes: 7984d22f1315 ("cxl/region: Add dev_dbg() detail on failure to allocate HPA space") Signed-off-by: Randy Dunlap Cc: Fan Ni Cc: Davidlohr Bueso Cc: Jonathan Cameron Cc: Dave Jiang Cc: Alison Schofield Cc: Vishal Verma Cc: Ira Weiny Cc: Dan Williams Cc: Link: https://lore.kernel.org/r/20240102173917.19718-1-rdunlap@infradead.org Signed-off-by: Dan Williams --- drivers/cxl/core/region.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index d904f9752bc38..0e88f1aed0184 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -553,8 +553,8 @@ static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size) dev_name(&cxlr->dev)); if (IS_ERR(res)) { dev_dbg(&cxlr->dev, - "HPA allocation error (%ld) for size:%#llx in %s %pr\n", - PTR_ERR(res), size, cxlrd->res->name, cxlrd->res); + "HPA allocation error (%ld) for size:%pap in %s %pr\n", + PTR_ERR(res), &size, cxlrd->res->name, cxlrd->res); return PTR_ERR(res); } -- GitLab From efa56305908ba20de2104f1b8508c6a7401833be Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 22 Dec 2023 16:17:48 +0100 Subject: [PATCH 0615/1290] nvmet-tcp: Fix a kernel panic when host sends an invalid H2C PDU length If the host sends an H2CData command with an invalid DATAL, the kernel may crash in nvmet_tcp_build_pdu_iovec(). Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 lr : nvmet_tcp_io_work+0x6ac/0x718 [nvmet_tcp] Call trace: process_one_work+0x174/0x3c8 worker_thread+0x2d0/0x3e8 kthread+0x104/0x110 Fix the bug by raising a fatal error if DATAL isn't coherent with the packet size. Also, the PDU length should never exceed the MAXH2CDATA parameter which has been communicated to the host in nvmet_tcp_handle_icreq(). Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver") Signed-off-by: Maurizio Lombardi Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 4cc27856aa8fe..ad16795934b83 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -24,6 +24,7 @@ #include "nvmet.h" #define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) +#define NVMET_TCP_MAXH2CDATA 0x400000 /* 16M arbitrary limit */ static int param_store_val(const char *str, int *val, int min, int max) { @@ -923,7 +924,7 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) icresp->hdr.pdo = 0; icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen); icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); - icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */ + icresp->maxdata = cpu_to_le32(NVMET_TCP_MAXH2CDATA); icresp->cpda = 0; if (queue->hdr_digest) icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE; @@ -978,6 +979,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) { struct nvme_tcp_data_pdu *data = &queue->pdu.data; struct nvmet_tcp_cmd *cmd; + unsigned int plen; if (likely(queue->nr_cmds)) { if (unlikely(data->ttag >= queue->nr_cmds)) { @@ -1001,7 +1003,16 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) return -EPROTO; } + plen = le32_to_cpu(data->hdr.plen); cmd->pdu_len = le32_to_cpu(data->data_length); + if (unlikely(cmd->pdu_len != (plen - sizeof(*data)) || + cmd->pdu_len == 0 || + cmd->pdu_len > NVMET_TCP_MAXH2CDATA)) { + pr_err("H2CData PDU len %u is invalid\n", cmd->pdu_len); + /* FIXME: use proper transport errors */ + nvmet_tcp_fatal_error(queue); + return -EPROTO; + } cmd->pdu_recv = 0; nvmet_tcp_build_pdu_iovec(cmd); queue->cmd = cmd; -- GitLab From 0849a5441358cef02586fb2d60f707c0db195628 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 22 Dec 2023 16:17:49 +0100 Subject: [PATCH 0616/1290] nvmet-tcp: fix a crash in nvmet_req_complete() in nvmet_tcp_handle_h2c_data_pdu(), if the host sends a data_offset different from rbytes_done, the driver ends up calling nvmet_req_complete() passing a status error. The problem is that at this point cmd->req is not yet initialized, the kernel will crash after dereferencing a NULL pointer. Fix the bug by replacing the call to nvmet_req_complete() with nvmet_tcp_fatal_error(). Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver") Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index ad16795934b83..b4b6a8ac80895 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -998,8 +998,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) data->ttag, le32_to_cpu(data->data_offset), cmd->rbytes_done); /* FIXME: use path and transport errors */ - nvmet_req_complete(&cmd->req, - NVME_SC_INVALID_FIELD | NVME_SC_DNR); + nvmet_tcp_fatal_error(queue); return -EPROTO; } -- GitLab From 75011bd0f9c55db523242f9f9a0b0b826165f14b Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 22 Dec 2023 16:17:50 +0100 Subject: [PATCH 0617/1290] nvmet-tcp: remove boilerplate code Simplify the nvmet_tcp_handle_h2c_data_pdu() function by removing boilerplate code. Signed-off-by: Maurizio Lombardi Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index b4b6a8ac80895..3569c1255c5eb 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -985,8 +985,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) if (unlikely(data->ttag >= queue->nr_cmds)) { pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n", queue->idx, data->ttag, queue->nr_cmds); - nvmet_tcp_fatal_error(queue); - return -EPROTO; + goto err_proto; } cmd = &queue->cmds[data->ttag]; } else { @@ -997,9 +996,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) pr_err("ttag %u unexpected data offset %u (expected %u)\n", data->ttag, le32_to_cpu(data->data_offset), cmd->rbytes_done); - /* FIXME: use path and transport errors */ - nvmet_tcp_fatal_error(queue); - return -EPROTO; + goto err_proto; } plen = le32_to_cpu(data->hdr.plen); @@ -1008,9 +1005,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) cmd->pdu_len == 0 || cmd->pdu_len > NVMET_TCP_MAXH2CDATA)) { pr_err("H2CData PDU len %u is invalid\n", cmd->pdu_len); - /* FIXME: use proper transport errors */ - nvmet_tcp_fatal_error(queue); - return -EPROTO; + goto err_proto; } cmd->pdu_recv = 0; nvmet_tcp_build_pdu_iovec(cmd); @@ -1018,6 +1013,11 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) queue->rcv_state = NVMET_TCP_RECV_DATA; return 0; + +err_proto: + /* FIXME: use proper transport errors */ + nvmet_tcp_fatal_error(queue); + return -EPROTO; } static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) -- GitLab From 7097c96411d22a1b3f6370dfd7eb2e3b7b83ff98 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Jan 2024 16:28:04 +0000 Subject: [PATCH 0618/1290] cachefiles: Fix __cachefiles_prepare_write() Fix __cachefiles_prepare_write() to correctly determine whether the requested write will fit correctly with the DIO alignment. Reported-by: Gao Xiang Signed-off-by: David Howells Tested-by: Yiqun Leng Tested-by: Jia Zhu cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-erofs@lists.ozlabs.org cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/cachefiles/io.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index bffffedce4a93..7529b40bc95a9 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -522,16 +522,22 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, bool no_space_allocated_yet) { struct cachefiles_cache *cache = object->volume->cache; - loff_t start = *_start, pos; - size_t len = *_len, down; + unsigned long long start = *_start, pos; + size_t len = *_len; int ret; /* Round to DIO size */ - down = start - round_down(start, PAGE_SIZE); - *_start = start - down; - *_len = round_up(down + len, PAGE_SIZE); - if (down < start || *_len > upper_len) + start = round_down(*_start, PAGE_SIZE); + if (start != *_start) { + kleave(" = -ENOBUFS [down]"); + return -ENOBUFS; + } + if (*_len > upper_len) { + kleave(" = -ENOBUFS [up]"); return -ENOBUFS; + } + + *_len = round_up(len, PAGE_SIZE); /* We need to work out whether there's sufficient disk space to perform * the write - but we can skip that check if we have space already @@ -542,7 +548,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, pos = cachefiles_inject_read_error(); if (pos == 0) - pos = vfs_llseek(file, *_start, SEEK_DATA); + pos = vfs_llseek(file, start, SEEK_DATA); if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { if (pos == -ENXIO) goto check_space; /* Unallocated tail */ @@ -550,7 +556,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, cachefiles_trace_seek_error); return pos; } - if ((u64)pos >= (u64)*_start + *_len) + if (pos >= start + *_len) goto check_space; /* Unallocated region */ /* We have a block that's at least partially filled - if we're low on @@ -563,13 +569,13 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, pos = cachefiles_inject_read_error(); if (pos == 0) - pos = vfs_llseek(file, *_start, SEEK_HOLE); + pos = vfs_llseek(file, start, SEEK_HOLE); if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { trace_cachefiles_io_error(object, file_inode(file), pos, cachefiles_trace_seek_error); return pos; } - if ((u64)pos >= (u64)*_start + *_len) + if (pos >= start + *_len) return 0; /* Fully allocated */ /* Partially allocated, but insufficient space: cull. */ @@ -577,7 +583,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, ret = cachefiles_inject_remove_error(); if (ret == 0) ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - *_start, *_len); + start, *_len); if (ret < 0) { trace_cachefiles_io_error(object, file_inode(file), ret, cachefiles_trace_fallocate_error); -- GitLab From 9546ac78b232bac56ff975072b1965e0e755ebd4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Jan 2024 20:33:17 +0000 Subject: [PATCH 0619/1290] 9p: Fix initialisation of netfs_inode for 9p The 9p filesystem is calling netfs_inode_init() in v9fs_init_inode() - before the struct inode fields have been initialised from the obtained file stats (ie. after v9fs_stat2inode*() has been called), but netfslib wants to set a couple of its fields from i_size. Reported-by: Marc Dionne Signed-off-by: David Howells Tested-by: Marc Dionne Tested-by: Dominique Martinet Acked-by: Dominique Martinet cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: v9fs@lists.linux.dev cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org --- fs/9p/v9fs_vfs.h | 1 + fs/9p/vfs_inode.c | 6 +++--- fs/9p/vfs_inode_dotl.c | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 731e3d14b67d3..0e8418066a482 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -42,6 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_free_inode(struct inode *inode); struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev); +void v9fs_set_netfs_context(struct inode *inode); int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t rdev); void v9fs_evict_inode(struct inode *inode); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b66466e97459c..32572982f72e6 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -246,7 +246,7 @@ void v9fs_free_inode(struct inode *inode) /* * Set parameters for the netfs library */ -static void v9fs_set_netfs_context(struct inode *inode) +void v9fs_set_netfs_context(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); netfs_inode_init(&v9inode->netfs, &v9fs_req_ops, true); @@ -326,8 +326,6 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, err = -EINVAL; goto error; } - - v9fs_set_netfs_context(inode); error: return err; @@ -359,6 +357,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) iput(inode); return ERR_PTR(err); } + v9fs_set_netfs_context(inode); return inode; } @@ -461,6 +460,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, goto error; v9fs_stat2inode(st, inode, sb, 0); + v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); return inode; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index e25fbc988f095..3505227e17040 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -128,6 +128,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, goto error; v9fs_stat2inode_dotl(st, inode, 0); + v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) -- GitLab From ef184b8844bf98a2a80fab8eecda1489aed5d97f Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Sun, 31 Dec 2023 14:56:44 +0800 Subject: [PATCH 0620/1290] nvme: tcp: remove unnecessary goto statement There is no requirement to call nvme_tcp_free_queue() for queue deallocation if the pskid is null or the queue allocation fails, as the NVME_TCP_Q_ALLOCATED flag would not be set in such scenarios. Signed-off-by: Guixin Liu Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index d79811cfa0ce8..5056bcae2f391 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1922,14 +1922,13 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) ctrl->opts->subsysnqn); if (!pskid) { dev_err(ctrl->device, "no valid PSK found\n"); - ret = -ENOKEY; - goto out_free_queue; + return -ENOKEY; } } ret = nvme_tcp_alloc_queue(ctrl, 0, pskid); if (ret) - goto out_free_queue; + return ret; ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl)); if (ret) -- GitLab From 2ad28ce9b98f8b22feaecc0966c706a8ef59cbf0 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 1 Jan 2024 12:35:27 +0200 Subject: [PATCH 0621/1290] nvme: remove unused definition There is no users for NVMF_AUTH_HASH_LEN macro. Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 44325c068b6a0..462c21e0e4176 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -20,7 +20,6 @@ #define NVMF_TRSVCID_SIZE 32 #define NVMF_TRADDR_SIZE 256 #define NVMF_TSAS_SIZE 256 -#define NVMF_AUTH_HASH_LEN 64 #define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" -- GitLab From 2abd2c39ada8200ca5f02d483dccfa82799f51a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:14:12 +0000 Subject: [PATCH 0622/1290] nvme-common: mark nvme_tls_psk_prio static Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/common/keyring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c index ee341b83eebaf..a5c0431c101cf 100644 --- a/drivers/nvme/common/keyring.c +++ b/drivers/nvme/common/keyring.c @@ -111,7 +111,7 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring, * should be preferred to 'generated' PSKs, * and SHA-384 should be preferred to SHA-256. */ -struct nvme_tls_psk_priority_list { +static struct nvme_tls_psk_priority_list { bool generated; enum nvme_tcp_tls_cipher cipher; } nvme_tls_psk_prio[] = { -- GitLab From 3a96bff229d6e3016805fd6c3dba0655ccba01eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:13:29 +0000 Subject: [PATCH 0623/1290] nvmet-tcp: fix a missing endianess conversion in nvmet_tcp_try_peek_pdu No, a __le32 cast doesn't magically byteswap on big-endian systems.. Fixes: 70525e5d82f6 ("nvmet-tcp: peek icreq before starting TLS") Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 3569c1255c5eb..792828fb91ccf 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1778,7 +1778,7 @@ static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue) (int)sizeof(struct nvme_tcp_icreq_pdu)); if (hdr->type == nvme_tcp_icreq && hdr->hlen == sizeof(struct nvme_tcp_icreq_pdu) && - hdr->plen == (__le32)sizeof(struct nvme_tcp_icreq_pdu)) { + hdr->plen == cpu_to_le32(sizeof(struct nvme_tcp_icreq_pdu))) { pr_debug("queue %d: icreq detected\n", queue->idx); return len; -- GitLab From d3074e9a73e3c0511f1033b15345e2feb9664b3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:58:41 +0000 Subject: [PATCH 0624/1290] nvme: update the explanation for not updating the limits in nvme_config_discard Expeand the comment a bit to explain what is going on. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d144d1acb09a0..56107cfc97b7b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1743,7 +1743,13 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, queue->limits.discard_granularity = size; - /* If discard is already enabled, don't reset queue limits */ + /* + * If discard is already enabled, don't reset queue limits. + * + * This works around the fact that the block layer can't cope well with + * updating the hardware limits when overridden through sysfs. This is + * harmless because discard limits in NVMe are purely advisory. + */ if (queue->limits.max_discard_sectors) return; -- GitLab From a4be9679aa3e862adcab465122c7678c2b5d40e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:58:42 +0000 Subject: [PATCH 0625/1290] nvme: also skip discard granularity updates in nvme_config_discard Don't just skip the discard sectors and segments but also the granularity if a value was already set before. Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 56107cfc97b7b..6c52b0ab382c8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1727,7 +1727,6 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, struct nvme_ns_head *head) { struct request_queue *queue = disk->queue; - u32 size = queue_logical_block_size(queue); if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) ctrl->max_discard_sectors = @@ -1741,8 +1740,6 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); - queue->limits.discard_granularity = size; - /* * If discard is already enabled, don't reset queue limits. * @@ -1755,6 +1752,7 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors); blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); + queue->limits.discard_granularity = queue_logical_block_size(queue); if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); -- GitLab From f29886c249ec2ed566e423fd02f6071b8f0a3346 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:58:43 +0000 Subject: [PATCH 0626/1290] nvme: fix max_discard_sectors calculation ctrl->max_discard_sectors stores a value that is potentially based of the DMRSL field in Identify Controller, which is in units of LBAs and thus dependent on the Format of a namespace. Fix this by moving the calculation of max_discard_sectors entirely into nvme_config_discard and replacing the ctrl->max_discard_sectors value with a local variable so that the calculation is always namespace-specific. Fixes: 1a86924e4f46 ("nvme: fix interpretation of DMRSL") Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 20 +++++++++----------- drivers/nvme/host/nvme.h | 1 - 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6c52b0ab382c8..86b9a1c4876f5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1727,12 +1727,13 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, struct nvme_ns_head *head) { struct request_queue *queue = disk->queue; + u32 max_discard_sectors; - if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) - ctrl->max_discard_sectors = - nvme_lba_to_sect(head, ctrl->dmrsl); - - if (ctrl->max_discard_sectors == 0) { + if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) { + max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl); + } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) { + max_discard_sectors = UINT_MAX; + } else { blk_queue_max_discard_sectors(queue, 0); return; } @@ -1750,7 +1751,7 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, if (queue->limits.max_discard_sectors) return; - blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors); + blk_queue_max_discard_sectors(queue, max_discard_sectors); blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); queue->limits.discard_granularity = queue_logical_block_size(queue); @@ -2911,13 +2912,10 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) struct nvme_id_ctrl_nvm *id; int ret; - if (ctrl->oncs & NVME_CTRL_ONCS_DSM) { - ctrl->max_discard_sectors = UINT_MAX; + if (ctrl->oncs & NVME_CTRL_ONCS_DSM) ctrl->max_discard_segments = NVME_DSM_MAX_RANGES; - } else { - ctrl->max_discard_sectors = 0; + else ctrl->max_discard_segments = 0; - } /* * Even though NVMe spec explicitly states that MDTS is not applicable diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 3dbd187896d8d..9a698c49ea036 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -297,7 +297,6 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; - u32 max_discard_sectors; u32 max_discard_segments; u32 max_zeroes_sectors; #ifdef CONFIG_BLK_DEV_ZONED -- GitLab From 3b946fe1cc149b23dad3a233c77b1475834f4d6f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:58:44 +0000 Subject: [PATCH 0627/1290] nvme: simplify the max_discard_segments calculation Just stash away the DMRL value in the nvme_ctrl struture, and leave all interpretation to nvme_config_discard, where we know DSM is supported by the time we're configuring the number of segments. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 13 +++++-------- drivers/nvme/host/nvme.h | 2 +- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 86b9a1c4876f5..50818dbcfa1ae 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1752,7 +1752,10 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, return; blk_queue_max_discard_sectors(queue, max_discard_sectors); - blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); + if (ctrl->dmrl) + blk_queue_max_discard_segments(queue, ctrl->dmrl); + else + blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); queue->limits.discard_granularity = queue_logical_block_size(queue); if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) @@ -2912,11 +2915,6 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) struct nvme_id_ctrl_nvm *id; int ret; - if (ctrl->oncs & NVME_CTRL_ONCS_DSM) - ctrl->max_discard_segments = NVME_DSM_MAX_RANGES; - else - ctrl->max_discard_segments = 0; - /* * Even though NVMe spec explicitly states that MDTS is not applicable * to the write-zeroes, we are cautious and limit the size to the @@ -2946,8 +2944,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) if (ret) goto free_data; - if (id->dmrl) - ctrl->max_discard_segments = id->dmrl; + ctrl->dmrl = id->dmrl; ctrl->dmrsl = le32_to_cpu(id->dmrsl); if (id->wzsl) ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9a698c49ea036..297b80430f1b6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -297,13 +297,13 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; - u32 max_discard_segments; u32 max_zeroes_sectors; #ifdef CONFIG_BLK_DEV_ZONED u32 max_zone_append; #endif u16 crdt[3]; u16 oncs; + u8 dmrl; u32 dmrsl; u16 oacs; u16 sqsize; -- GitLab From 72e8c9379dbef2662c2479c3d142e4c44d598a5b Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 18 Dec 2023 16:30:50 +0100 Subject: [PATCH 0628/1290] nvmet-fc: remove unnecessary bracket There is no need for the bracket around the identifier. Remove it. Signed-off-by: Daniel Wagner Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/target/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index bd59990b52501..bda7a3009e851 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1031,7 +1031,7 @@ nvmet_fc_match_hostport(struct nvmet_fc_tgtport *tgtport, void *hosthandle) list_for_each_entry(host, &tgtport->host_list, host_list) { if (host->hosthandle == hosthandle && !host->invalid) { if (nvmet_fc_hostport_get(host)) - return (host); + return host; } } -- GitLab From 0e716cec6fb11a14c220ee17c404b67962e902f7 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 18 Dec 2023 16:30:51 +0100 Subject: [PATCH 0629/1290] nvmet-trace: avoid dereferencing pointer too early The first command issued from the host to the target is the fabrics connect command. At this point, neither the target queue nor the controller have been allocated. But we already try to trace this command in nvmet_req_init. Reported by KASAN. Reviewed-by: Hannes Reinecke Signed-off-by: Daniel Wagner Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/trace.c | 6 +++--- drivers/nvme/target/trace.h | 28 +++++++++++++++++----------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index bff454d46255b..6ee1f3db81d04 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -211,7 +211,7 @@ const char *nvmet_trace_disk_name(struct trace_seq *p, char *name) return ret; } -const char *nvmet_trace_ctrl_name(struct trace_seq *p, struct nvmet_ctrl *ctrl) +const char *nvmet_trace_ctrl_id(struct trace_seq *p, u16 ctrl_id) { const char *ret = trace_seq_buffer_ptr(p); @@ -224,8 +224,8 @@ const char *nvmet_trace_ctrl_name(struct trace_seq *p, struct nvmet_ctrl *ctrl) * If we can know the extra data of the connect command in this stage, * we can update this print statement later. */ - if (ctrl) - trace_seq_printf(p, "%d", ctrl->cntlid); + if (ctrl_id) + trace_seq_printf(p, "%d", ctrl_id); else trace_seq_printf(p, "_"); trace_seq_putc(p, 0); diff --git a/drivers/nvme/target/trace.h b/drivers/nvme/target/trace.h index 6109b3806b12b..2f15070ddc569 100644 --- a/drivers/nvme/target/trace.h +++ b/drivers/nvme/target/trace.h @@ -32,18 +32,24 @@ const char *nvmet_trace_parse_fabrics_cmd(struct trace_seq *p, u8 fctype, nvmet_trace_parse_nvm_cmd(p, opcode, cdw10) : \ nvmet_trace_parse_admin_cmd(p, opcode, cdw10))) -const char *nvmet_trace_ctrl_name(struct trace_seq *p, struct nvmet_ctrl *ctrl); -#define __print_ctrl_name(ctrl) \ - nvmet_trace_ctrl_name(p, ctrl) +const char *nvmet_trace_ctrl_id(struct trace_seq *p, u16 ctrl_id); +#define __print_ctrl_id(ctrl_id) \ + nvmet_trace_ctrl_id(p, ctrl_id) const char *nvmet_trace_disk_name(struct trace_seq *p, char *name); #define __print_disk_name(name) \ nvmet_trace_disk_name(p, name) #ifndef TRACE_HEADER_MULTI_READ -static inline struct nvmet_ctrl *nvmet_req_to_ctrl(struct nvmet_req *req) +static inline u16 nvmet_req_to_ctrl_id(struct nvmet_req *req) { - return req->sq->ctrl; + /* + * The queue and controller pointers are not valid until an association + * has been established. + */ + if (!req->sq || !req->sq->ctrl) + return 0; + return req->sq->ctrl->cntlid; } static inline void __assign_req_name(char *name, struct nvmet_req *req) @@ -63,7 +69,7 @@ TRACE_EVENT(nvmet_req_init, TP_ARGS(req, cmd), TP_STRUCT__entry( __field(struct nvme_command *, cmd) - __field(struct nvmet_ctrl *, ctrl) + __field(u16, ctrl_id) __array(char, disk, DISK_NAME_LEN) __field(int, qid) __field(u16, cid) @@ -76,7 +82,7 @@ TRACE_EVENT(nvmet_req_init, ), TP_fast_assign( __entry->cmd = cmd; - __entry->ctrl = nvmet_req_to_ctrl(req); + __entry->ctrl_id = nvmet_req_to_ctrl_id(req); __assign_req_name(__entry->disk, req); __entry->qid = req->sq->qid; __entry->cid = cmd->common.command_id; @@ -90,7 +96,7 @@ TRACE_EVENT(nvmet_req_init, ), TP_printk("nvmet%s: %sqid=%d, cmdid=%u, nsid=%u, flags=%#x, " "meta=%#llx, cmd=(%s, %s)", - __print_ctrl_name(__entry->ctrl), + __print_ctrl_id(__entry->ctrl_id), __print_disk_name(__entry->disk), __entry->qid, __entry->cid, __entry->nsid, __entry->flags, __entry->metadata, @@ -104,7 +110,7 @@ TRACE_EVENT(nvmet_req_complete, TP_PROTO(struct nvmet_req *req), TP_ARGS(req), TP_STRUCT__entry( - __field(struct nvmet_ctrl *, ctrl) + __field(u16, ctrl_id) __array(char, disk, DISK_NAME_LEN) __field(int, qid) __field(int, cid) @@ -112,7 +118,7 @@ TRACE_EVENT(nvmet_req_complete, __field(u16, status) ), TP_fast_assign( - __entry->ctrl = nvmet_req_to_ctrl(req); + __entry->ctrl_id = nvmet_req_to_ctrl_id(req); __entry->qid = req->cq->qid; __entry->cid = req->cqe->command_id; __entry->result = le64_to_cpu(req->cqe->result.u64); @@ -120,7 +126,7 @@ TRACE_EVENT(nvmet_req_complete, __assign_req_name(__entry->disk, req); ), TP_printk("nvmet%s: %sqid=%d, cmdid=%u, res=%#llx, status=%#x", - __print_ctrl_name(__entry->ctrl), + __print_ctrl_id(__entry->ctrl_id), __print_disk_name(__entry->disk), __entry->qid, __entry->cid, __entry->result, __entry->status) -- GitLab From 1af5aa82c976753e93eb52b72784e586a7d2844b Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Mon, 27 Nov 2023 20:59:04 +0300 Subject: [PATCH 0630/1290] apparmor: free the allocated pdb objects policy_db objects are allocated with kzalloc() inside aa_alloc_pdb() and are not cleared in the corresponding aa_free_pdb() function causing leak: unreferenced object 0xffff88801f0a1400 (size 192): comm "apparmor_parser", pid 1247, jiffies 4295122827 (age 2306.399s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] __kmem_cache_alloc_node+0x1e2/0x2d0 [] kmalloc_trace+0x25/0xc0 [] aa_alloc_pdb+0x82/0x140 [] unpack_pdb+0xc7/0x2700 [] unpack_profile+0x450/0x4960 [] aa_unpack+0x309/0x15e0 [] aa_replace_profiles+0x213/0x33c0 [] policy_update+0x261/0x370 [] profile_replace+0x20e/0x2a0 [] vfs_write+0x2af/0xe00 [] ksys_write+0x126/0x250 [] do_syscall_64+0x46/0xf0 [] entry_SYSCALL_64_after_hwframe+0x6e/0x76 Free the pdbs inside aa_free_pdb(). While at it, rename the variable representing an aa_policydb object to make the function more unified with aa_pdb_free_kref() and aa_alloc_pdb(). Found by Linux Verification Center (linuxtesting.org). Fixes: 98b824ff8984 ("apparmor: refcount the pdb") Signed-off-by: Fedor Pchelkin Signed-off-by: John Johansen --- security/apparmor/policy.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index ed4c9803c8fad..957654d253dd7 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -99,13 +99,14 @@ const char *const aa_profile_mode_names[] = { }; -static void aa_free_pdb(struct aa_policydb *policy) +static void aa_free_pdb(struct aa_policydb *pdb) { - if (policy) { - aa_put_dfa(policy->dfa); - if (policy->perms) - kvfree(policy->perms); - aa_free_str_table(&policy->trans); + if (pdb) { + aa_put_dfa(pdb->dfa); + if (pdb->perms) + kvfree(pdb->perms); + aa_free_str_table(&pdb->trans); + kfree(pdb); } } -- GitLab From 9c51f8788b5d4e9f46afbcf563255cfd355690b3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 17:46:55 -0800 Subject: [PATCH 0631/1290] perf env: Avoid recursively taking env->bpf_progs.lock Add variants of perf_env__insert_bpf_prog_info(), perf_env__insert_btf() and perf_env__find_btf prefixed with __ to indicate the env->bpf_progs.lock is assumed held. Call these variants when the lock is held to avoid recursively taking it and potentially having a thread deadlock with itself. Fixes: f8dfeae009effc0b ("perf bpf: Show more BPF program info in print_bpf_prog_info()") Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Acked-by: Song Liu Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Huacai Chen Cc: Ingo Molnar Cc: K Prateek Nayak Cc: Kan Liang Cc: Mark Rutland Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20231207014655.1252484-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-event.c | 8 +++--- tools/perf/util/bpf-event.h | 12 ++++----- tools/perf/util/env.c | 50 ++++++++++++++++++++++++------------- tools/perf/util/env.h | 4 +++ tools/perf/util/header.c | 8 +++--- 5 files changed, 50 insertions(+), 32 deletions(-) diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 830711cae30d3..3573e0b7ef3ed 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -545,9 +545,9 @@ int evlist__add_bpf_sb_event(struct evlist *evlist, struct perf_env *env) return evlist__add_sb_event(evlist, &attr, bpf_event__sb_cb, env); } -void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info, - struct perf_env *env, - FILE *fp) +void __bpf_event__print_bpf_prog_info(struct bpf_prog_info *info, + struct perf_env *env, + FILE *fp) { __u32 *prog_lens = (__u32 *)(uintptr_t)(info->jited_func_lens); __u64 *prog_addrs = (__u64 *)(uintptr_t)(info->jited_ksyms); @@ -563,7 +563,7 @@ void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info, if (info->btf_id) { struct btf_node *node; - node = perf_env__find_btf(env, info->btf_id); + node = __perf_env__find_btf(env, info->btf_id); if (node) btf = btf__new((__u8 *)(node->data), node->data_size); diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h index 1bcbd4fb6c669..e2f0420905f59 100644 --- a/tools/perf/util/bpf-event.h +++ b/tools/perf/util/bpf-event.h @@ -33,9 +33,9 @@ struct btf_node { int machine__process_bpf(struct machine *machine, union perf_event *event, struct perf_sample *sample); int evlist__add_bpf_sb_event(struct evlist *evlist, struct perf_env *env); -void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info, - struct perf_env *env, - FILE *fp); +void __bpf_event__print_bpf_prog_info(struct bpf_prog_info *info, + struct perf_env *env, + FILE *fp); #else static inline int machine__process_bpf(struct machine *machine __maybe_unused, union perf_event *event __maybe_unused, @@ -50,9 +50,9 @@ static inline int evlist__add_bpf_sb_event(struct evlist *evlist __maybe_unused, return 0; } -static inline void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info __maybe_unused, - struct perf_env *env __maybe_unused, - FILE *fp __maybe_unused) +static inline void __bpf_event__print_bpf_prog_info(struct bpf_prog_info *info __maybe_unused, + struct perf_env *env __maybe_unused, + FILE *fp __maybe_unused) { } diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index c68b7a004f294..a459374d0a1a1 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -24,13 +24,19 @@ struct perf_env perf_env; void perf_env__insert_bpf_prog_info(struct perf_env *env, struct bpf_prog_info_node *info_node) +{ + down_write(&env->bpf_progs.lock); + __perf_env__insert_bpf_prog_info(env, info_node); + up_write(&env->bpf_progs.lock); +} + +void __perf_env__insert_bpf_prog_info(struct perf_env *env, struct bpf_prog_info_node *info_node) { __u32 prog_id = info_node->info_linear->info.id; struct bpf_prog_info_node *node; struct rb_node *parent = NULL; struct rb_node **p; - down_write(&env->bpf_progs.lock); p = &env->bpf_progs.infos.rb_node; while (*p != NULL) { @@ -42,15 +48,13 @@ void perf_env__insert_bpf_prog_info(struct perf_env *env, p = &(*p)->rb_right; } else { pr_debug("duplicated bpf prog info %u\n", prog_id); - goto out; + return; } } rb_link_node(&info_node->rb_node, parent, p); rb_insert_color(&info_node->rb_node, &env->bpf_progs.infos); env->bpf_progs.infos_cnt++; -out: - up_write(&env->bpf_progs.lock); } struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env, @@ -79,14 +83,22 @@ out: } bool perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node) +{ + bool ret; + + down_write(&env->bpf_progs.lock); + ret = __perf_env__insert_btf(env, btf_node); + up_write(&env->bpf_progs.lock); + return ret; +} + +bool __perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node) { struct rb_node *parent = NULL; __u32 btf_id = btf_node->id; struct btf_node *node; struct rb_node **p; - bool ret = true; - down_write(&env->bpf_progs.lock); p = &env->bpf_progs.btfs.rb_node; while (*p != NULL) { @@ -98,25 +110,31 @@ bool perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node) p = &(*p)->rb_right; } else { pr_debug("duplicated btf %u\n", btf_id); - ret = false; - goto out; + return false; } } rb_link_node(&btf_node->rb_node, parent, p); rb_insert_color(&btf_node->rb_node, &env->bpf_progs.btfs); env->bpf_progs.btfs_cnt++; -out: - up_write(&env->bpf_progs.lock); - return ret; + return true; } struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id) +{ + struct btf_node *res; + + down_read(&env->bpf_progs.lock); + res = __perf_env__find_btf(env, btf_id); + up_read(&env->bpf_progs.lock); + return res; +} + +struct btf_node *__perf_env__find_btf(struct perf_env *env, __u32 btf_id) { struct btf_node *node = NULL; struct rb_node *n; - down_read(&env->bpf_progs.lock); n = env->bpf_progs.btfs.rb_node; while (n) { @@ -126,13 +144,9 @@ struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id) else if (btf_id > node->id) n = n->rb_right; else - goto out; + return node; } - node = NULL; - -out: - up_read(&env->bpf_progs.lock); - return node; + return NULL; } /* purge data in bpf_progs.infos tree */ diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index bf7e3c4c211f9..7c527e65c1864 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -175,12 +175,16 @@ const char *perf_env__raw_arch(struct perf_env *env); int perf_env__nr_cpus_avail(struct perf_env *env); void perf_env__init(struct perf_env *env); +void __perf_env__insert_bpf_prog_info(struct perf_env *env, + struct bpf_prog_info_node *info_node); void perf_env__insert_bpf_prog_info(struct perf_env *env, struct bpf_prog_info_node *info_node); struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env, __u32 prog_id); bool perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node); +bool __perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node); struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id); +struct btf_node *__perf_env__find_btf(struct perf_env *env, __u32 btf_id); int perf_env__numa_node(struct perf_env *env, struct perf_cpu cpu); char *perf_env__find_pmu_cap(struct perf_env *env, const char *pmu_name, diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index a9f71f8343f08..3fe28edc3d017 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1849,8 +1849,8 @@ static void print_bpf_prog_info(struct feat_fd *ff, FILE *fp) node = rb_entry(next, struct bpf_prog_info_node, rb_node); next = rb_next(&node->rb_node); - bpf_event__print_bpf_prog_info(&node->info_linear->info, - env, fp); + __bpf_event__print_bpf_prog_info(&node->info_linear->info, + env, fp); } up_read(&env->bpf_progs.lock); @@ -3188,7 +3188,7 @@ static int process_bpf_prog_info(struct feat_fd *ff, void *data __maybe_unused) /* after reading from file, translate offset to address */ bpil_offs_to_addr(info_linear); info_node->info_linear = info_linear; - perf_env__insert_bpf_prog_info(env, info_node); + __perf_env__insert_bpf_prog_info(env, info_node); } up_write(&env->bpf_progs.lock); @@ -3235,7 +3235,7 @@ static int process_bpf_btf(struct feat_fd *ff, void *data __maybe_unused) if (__do_read(ff, node->data, data_size)) goto out; - perf_env__insert_btf(env, node); + __perf_env__insert_btf(env, node); node = NULL; } -- GitLab From 7d1405c71df21f6c394b8a885aa8a133f749fa22 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 6 Dec 2023 18:16:27 -0800 Subject: [PATCH 0632/1290] perf record: Reduce memory for recording PERF_RECORD_LOST_SAMPLES event Reduce from PERF_SAMPLE_MAX_SIZE to "sizeof(*lost) + session->machines.host.id_hdr_size". Suggested-by: Namhyung Kim Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231207021627.1322884-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index a89013c44fd53..91e6828c38cc2 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1954,7 +1954,8 @@ static void record__read_lost_samples(struct record *rec) if (count.lost) { if (!lost) { - lost = zalloc(PERF_SAMPLE_MAX_SIZE); + lost = zalloc(sizeof(*lost) + + session->machines.host.id_hdr_size); if (!lost) { pr_debug("Memory allocation failed\n"); return; @@ -1970,7 +1971,8 @@ static void record__read_lost_samples(struct record *rec) lost_count = perf_bpf_filter__lost_count(evsel); if (lost_count) { if (!lost) { - lost = zalloc(PERF_SAMPLE_MAX_SIZE); + lost = zalloc(sizeof(*lost) + + session->machines.host.id_hdr_size); if (!lost) { pr_debug("Memory allocation failed\n"); return; -- GitLab From f2567e12a090f0eb22553a4468d4c4fe04aad906 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 11 Dec 2023 10:12:41 -0800 Subject: [PATCH 0633/1290] perf stat: Fix hard coded LL miss units Copy-paste error where LL cache misses are reported as l1i. Fixes: 0a57b910807ad163 ("perf stat: Use counts rather than saved_value") Suggested-by: Guillaume Endignoux Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231211181242.1721059-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-shadow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 1c5c3eeba4cfb..e31426167852a 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -264,7 +264,7 @@ static void print_ll_miss(struct perf_stat_config *config, static const double color_ratios[3] = {20.0, 10.0, 5.0}; print_ratio(config, evsel, aggr_idx, misses, out, STAT_LL_CACHE, color_ratios, - "of all L1-icache accesses"); + "of all LL-cache accesses"); } static void print_dtlb_miss(struct perf_stat_config *config, -- GitLab From 346878dacc81f53667381c8f4bb5018195ca10be Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Wed, 29 Nov 2023 11:55:04 +0530 Subject: [PATCH 0634/1290] perf vendor events amd: Add Zen 4 memory controller events Make the jevents parser aware of the Unified Memory Controller (UMC) PMU and add events taken from Section 8.2.1 "UMC Performance Monitor Events" of the Processor Programming Reference (PPR) for AMD Family 19h Model 11h processors. The events capture UMC command activity such as CAS, ACTIVATE, PRECHARGE etc. while the metrics derive data bus utilization and memory bandwidth out of these events. Signed-off-by: Sandipan Das Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Link: https://lore.kernel.org/r/e0d8a7e8ca8ee3e378d8029e80b456ac327d6419.1701238314.git.sandipan.das@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/amdzen4/memory-controller.json | 101 ++++++++++++++++++ .../arch/x86/amdzen4/recommended.json | 84 +++++++++++++++ tools/perf/pmu-events/jevents.py | 2 + 3 files changed, 187 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/amdzen4/memory-controller.json diff --git a/tools/perf/pmu-events/arch/x86/amdzen4/memory-controller.json b/tools/perf/pmu-events/arch/x86/amdzen4/memory-controller.json new file mode 100644 index 0000000000000..55263e5e4f69a --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen4/memory-controller.json @@ -0,0 +1,101 @@ +[ + { + "EventName": "umc_mem_clk", + "PublicDescription": "Number of memory clock cycles.", + "EventCode": "0x00", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_act_cmd.all", + "PublicDescription": "Number of ACTIVATE commands sent.", + "EventCode": "0x05", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_act_cmd.rd", + "PublicDescription": "Number of ACTIVATE commands sent for reads.", + "EventCode": "0x05", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_act_cmd.wr", + "PublicDescription": "Number of ACTIVATE commands sent for writes.", + "EventCode": "0x05", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_pchg_cmd.all", + "PublicDescription": "Number of PRECHARGE commands sent.", + "EventCode": "0x06", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_pchg_cmd.rd", + "PublicDescription": "Number of PRECHARGE commands sent for reads.", + "EventCode": "0x06", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_pchg_cmd.wr", + "PublicDescription": "Number of PRECHARGE commands sent for writes.", + "EventCode": "0x06", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_cas_cmd.all", + "PublicDescription": "Number of CAS commands sent.", + "EventCode": "0x0a", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_cas_cmd.rd", + "PublicDescription": "Number of CAS commands sent for reads.", + "EventCode": "0x0a", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_cas_cmd.wr", + "PublicDescription": "Number of CAS commands sent for writes.", + "EventCode": "0x0a", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_data_slot_clks.all", + "PublicDescription": "Number of clocks used by the data bus.", + "EventCode": "0x14", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_data_slot_clks.rd", + "PublicDescription": "Number of clocks used by the data bus for reads.", + "EventCode": "0x14", + "RdWrMask": "0x1", + "PerPkg": "1", + "Unit": "UMCPMC" + }, + { + "EventName": "umc_data_slot_clks.wr", + "PublicDescription": "Number of clocks used by the data bus for writes.", + "EventCode": "0x14", + "RdWrMask": "0x2", + "PerPkg": "1", + "Unit": "UMCPMC" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen4/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen4/recommended.json index 5e6a793acf7b2..96e06401c6cbb 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen4/recommended.json +++ b/tools/perf/pmu-events/arch/x86/amdzen4/recommended.json @@ -330,5 +330,89 @@ "MetricGroup": "data_fabric", "PerPkg": "1", "ScaleUnit": "6.103515625e-5MiB" + }, + { + "MetricName": "umc_data_bus_utilization", + "BriefDescription": "Memory controller data bus utilization.", + "MetricExpr": "d_ratio(umc_data_slot_clks.all / 2, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_cas_cmd_rate", + "BriefDescription": "Memory controller CAS command rate.", + "MetricExpr": "d_ratio(umc_cas_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1" + }, + { + "MetricName": "umc_cas_cmd_read_ratio", + "BriefDescription": "Ratio of memory controller CAS commands for reads.", + "MetricExpr": "d_ratio(umc_cas_cmd.rd, umc_cas_cmd.all)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_cas_cmd_write_ratio", + "BriefDescription": "Ratio of memory controller CAS commands for writes.", + "MetricExpr": "d_ratio(umc_cas_cmd.wr, umc_cas_cmd.all)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_mem_read_bandwidth", + "BriefDescription": "Estimated memory read bandwidth.", + "MetricExpr": "(umc_cas_cmd.rd * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_mem_write_bandwidth", + "BriefDescription": "Estimated memory write bandwidth.", + "MetricExpr": "(umc_cas_cmd.wr * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_mem_bandwidth", + "BriefDescription": "Estimated combined memory bandwidth.", + "MetricExpr": "(umc_cas_cmd.all * 64) / 1e6 / duration_time", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "1MB/s" + }, + { + "MetricName": "umc_cas_cmd_read_ratio", + "BriefDescription": "Ratio of memory controller CAS commands for reads.", + "MetricExpr": "d_ratio(umc_cas_cmd.rd, umc_cas_cmd.all)", + "MetricGroup": "memory_controller", + "PerPkg": "1", + "ScaleUnit": "100%" + }, + { + "MetricName": "umc_cas_cmd_rate", + "BriefDescription": "Memory controller CAS command rate.", + "MetricExpr": "d_ratio(umc_cas_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1" + }, + { + "MetricName": "umc_activate_cmd_rate", + "BriefDescription": "Memory controller ACTIVATE command rate.", + "MetricExpr": "d_ratio(umc_act_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1" + }, + { + "MetricName": "umc_precharge_cmd_rate", + "BriefDescription": "Memory controller PRECHARGE command rate.", + "MetricExpr": "d_ratio(umc_pchg_cmd.all * 1000, umc_mem_clk)", + "MetricGroup": "memory_controller", + "PerPkg": "1" } ] diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py index 0093c998cb6e6..53ab050c8fa43 100755 --- a/tools/perf/pmu-events/jevents.py +++ b/tools/perf/pmu-events/jevents.py @@ -286,6 +286,7 @@ class JsonEvent: 'imx8_ddr': 'imx8_ddr', 'L3PMC': 'amd_l3', 'DFPMC': 'amd_df', + 'UMCPMC': 'amd_umc', 'cpu_core': 'cpu_core', 'cpu_atom': 'cpu_atom', 'ali_drw': 'ali_drw', @@ -354,6 +355,7 @@ class JsonEvent: ('SampleAfterValue', 'period='), ('UMask', 'umask='), ('NodeType', 'type='), + ('RdWrMask', 'rdwrmask='), ] for key, value in event_fields: if key in jd and jd[key] != '0': -- GitLab From eb00697b91646de22d6d9b4e6a5fadf4495fdf69 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 3 Jan 2024 09:01:58 -0800 Subject: [PATCH 0635/1290] perf x86 test: Update hybrid expectations The legacy events cpu-cycles and instructions have sysfs event equivalents on x86 (see /sys/devices/cpu_core/events). As sysfs/JSON events are now higher in priority than legacy events this causes the hybrid test expectations not to be met. To fix this switch to legacy events that don't have sysfs versions, namely cpu-cycles becomes cycles and instructions becomes branches. Fixes: a24d9d9dc096fc0d ("perf parse-events: Make legacy events lower priority than sysfs/JSON") Reported-by: Arnaldo Carvalho de Melo Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Closes: https://lore.kernel.org/lkml/ZYbm5L7tw7bdpDpE@kernel.org/ Link: https://lore.kernel.org/r/20240103170159.1435753-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/hybrid.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/perf/arch/x86/tests/hybrid.c b/tools/perf/arch/x86/tests/hybrid.c index eb152770f1485..05a5f81e81671 100644 --- a/tools/perf/arch/x86/tests/hybrid.c +++ b/tools/perf/arch/x86/tests/hybrid.c @@ -47,7 +47,7 @@ static int test__hybrid_hw_group_event(struct evlist *evlist) evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong hybrid type", test_hybrid_type(evsel, PERF_TYPE_RAW)); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_INSTRUCTIONS)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); return TEST_OK; } @@ -102,7 +102,7 @@ static int test__hybrid_group_modifier1(struct evlist *evlist) evsel = evsel__next(evsel); TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); TEST_ASSERT_VAL("wrong hybrid type", test_hybrid_type(evsel, PERF_TYPE_RAW)); - TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_INSTRUCTIONS)); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_BRANCH_INSTRUCTIONS)); TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); @@ -171,27 +171,27 @@ struct evlist_test { static const struct evlist_test test__hybrid_events[] = { { - .name = "cpu_core/cpu-cycles/", + .name = "cpu_core/cycles/", .check = test__hybrid_hw_event_with_pmu, /* 0 */ }, { - .name = "{cpu_core/cpu-cycles/,cpu_core/instructions/}", + .name = "{cpu_core/cycles/,cpu_core/branches/}", .check = test__hybrid_hw_group_event, /* 1 */ }, { - .name = "{cpu-clock,cpu_core/cpu-cycles/}", + .name = "{cpu-clock,cpu_core/cycles/}", .check = test__hybrid_sw_hw_group_event, /* 2 */ }, { - .name = "{cpu_core/cpu-cycles/,cpu-clock}", + .name = "{cpu_core/cycles/,cpu-clock}", .check = test__hybrid_hw_sw_group_event, /* 3 */ }, { - .name = "{cpu_core/cpu-cycles/k,cpu_core/instructions/u}", + .name = "{cpu_core/cycles/k,cpu_core/branches/u}", .check = test__hybrid_group_modifier1, /* 4 */ }, -- GitLab From ec5257d99e6894d65fae772ca43c53b3d6855115 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 3 Jan 2024 09:01:59 -0800 Subject: [PATCH 0636/1290] perf x86 test: Add hybrid test for conflicting legacy/sysfs event The cpu-cycles event is both a legacy event and declared in /sys/devices/cpu_core/events/cpu-cycles. The cycles event is a legacy event but with no sysfs version. Add a test that the sysfs version is preferred to the legacy for cpu-cycles, while for cycles we use the legacy version. Suggested-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240103170159.1435753-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/hybrid.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/perf/arch/x86/tests/hybrid.c b/tools/perf/arch/x86/tests/hybrid.c index 05a5f81e81671..40f5d17fedab6 100644 --- a/tools/perf/arch/x86/tests/hybrid.c +++ b/tools/perf/arch/x86/tests/hybrid.c @@ -163,6 +163,24 @@ static int test__checkevent_pmu(struct evlist *evlist) return TEST_OK; } +static int test__hybrid_hw_group_event_2(struct evlist *evlist) +{ + struct evsel *evsel, *leader; + + evsel = leader = evlist__first(evlist); + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong hybrid type", test_hybrid_type(evsel, PERF_TYPE_RAW)); + TEST_ASSERT_VAL("wrong config", test_config(evsel, PERF_COUNT_HW_CPU_CYCLES)); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", evsel->core.attr.config == 0x3c); + TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader)); + return TEST_OK; +} + struct evlist_test { const char *name; bool (*valid)(void); @@ -215,6 +233,11 @@ static const struct evlist_test test__hybrid_events[] = { .check = test__hybrid_cache_event, /* 8 */ }, + { + .name = "{cpu_core/cycles/,cpu_core/cpu-cycles/}", + .check = test__hybrid_hw_group_event_2, + /* 9 */ + }, }; static int test_event(const struct evlist_test *e) -- GitLab From 6d0fc416c42a98b39a74151376928d577873941c Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:28 -0800 Subject: [PATCH 0637/1290] cxl/trace: Pass UUID explicitly to event traces CXL CPER events are identified by the CPER Section Type GUID. The GUID correlates with the CXL UUID for the event record. It turns out that a CXL CPER record is a strict subset of the CXL event record, only the UUID header field is chopped. In order to unify handling between native and CPER flavors of CXL events, prepare the code for the UUID to be passed in rather than inferred from the record itself. Later patches update the passed in record to only refer to the common data between the formats. Pass the UUID explicitly to each trace event to be able to remove the UUID from the event structures. Originally it was desirable to remove the UUID from the well known event because the UUID value was redundant. However, the trace API was already in place.[1] Signed-off-by: Ira Weiny Link: https://lore.kernel.org/all/36f2d12934d64a278f2c0313cbd01abc@huawei.com [1] Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-1-1bb8a4ca2c7a@intel.com Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 8 ++++---- drivers/cxl/core/trace.h | 28 ++++++++++++++-------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 36270dcfb42ef..00f429c440dfe 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -870,19 +870,19 @@ static void cxl_event_trace_record(const struct cxl_memdev *cxlmd, struct cxl_event_gen_media *rec = (struct cxl_event_gen_media *)record; - trace_cxl_general_media(cxlmd, type, rec); + trace_cxl_general_media(cxlmd, type, id, rec); } else if (uuid_equal(id, &dram_event_uuid)) { struct cxl_event_dram *rec = (struct cxl_event_dram *)record; - trace_cxl_dram(cxlmd, type, rec); + trace_cxl_dram(cxlmd, type, id, rec); } else if (uuid_equal(id, &mem_mod_event_uuid)) { struct cxl_event_mem_module *rec = (struct cxl_event_mem_module *)record; - trace_cxl_memory_module(cxlmd, type, rec); + trace_cxl_memory_module(cxlmd, type, id, rec); } else { /* For unknown record types print just the header */ - trace_cxl_generic_event(cxlmd, type, record); + trace_cxl_generic_event(cxlmd, type, id, record); } } diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index a0b5819bc70b3..3da16026b8dbf 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -198,12 +198,12 @@ TRACE_EVENT(cxl_overflow, __field(u8, hdr_length) \ __field(u8, hdr_maint_op_class) -#define CXL_EVT_TP_fast_assign(cxlmd, l, hdr) \ +#define CXL_EVT_TP_fast_assign(cxlmd, l, uuid, hdr) \ __assign_str(memdev, dev_name(&(cxlmd)->dev)); \ __assign_str(host, dev_name((cxlmd)->dev.parent)); \ __entry->log = (l); \ __entry->serial = (cxlmd)->cxlds->serial; \ - memcpy(&__entry->hdr_uuid, &(hdr).id, sizeof(uuid_t)); \ + memcpy(&__entry->hdr_uuid, (uuid), sizeof(uuid_t)); \ __entry->hdr_length = (hdr).length; \ __entry->hdr_flags = get_unaligned_le24((hdr).flags); \ __entry->hdr_handle = le16_to_cpu((hdr).handle); \ @@ -225,9 +225,9 @@ TRACE_EVENT(cxl_overflow, TRACE_EVENT(cxl_generic_event, TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, - struct cxl_event_record_raw *rec), + const uuid_t *uuid, struct cxl_event_record_raw *rec), - TP_ARGS(cxlmd, log, rec), + TP_ARGS(cxlmd, log, uuid, rec), TP_STRUCT__entry( CXL_EVT_TP_entry @@ -235,7 +235,7 @@ TRACE_EVENT(cxl_generic_event, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, uuid, rec->hdr); memcpy(__entry->data, &rec->data, CXL_EVENT_RECORD_DATA_LENGTH); ), @@ -315,9 +315,9 @@ TRACE_EVENT(cxl_generic_event, TRACE_EVENT(cxl_general_media, TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, - struct cxl_event_gen_media *rec), + const uuid_t *uuid, struct cxl_event_gen_media *rec), - TP_ARGS(cxlmd, log, rec), + TP_ARGS(cxlmd, log, uuid, rec), TP_STRUCT__entry( CXL_EVT_TP_entry @@ -336,7 +336,7 @@ TRACE_EVENT(cxl_general_media, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, uuid, rec->hdr); /* General Media */ __entry->dpa = le64_to_cpu(rec->phys_addr); @@ -398,9 +398,9 @@ TRACE_EVENT(cxl_general_media, TRACE_EVENT(cxl_dram, TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, - struct cxl_event_dram *rec), + const uuid_t *uuid, struct cxl_event_dram *rec), - TP_ARGS(cxlmd, log, rec), + TP_ARGS(cxlmd, log, uuid, rec), TP_STRUCT__entry( CXL_EVT_TP_entry @@ -422,7 +422,7 @@ TRACE_EVENT(cxl_dram, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, uuid, rec->hdr); /* DRAM */ __entry->dpa = le64_to_cpu(rec->phys_addr); @@ -547,9 +547,9 @@ TRACE_EVENT(cxl_dram, TRACE_EVENT(cxl_memory_module, TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, - struct cxl_event_mem_module *rec), + const uuid_t *uuid, struct cxl_event_mem_module *rec), - TP_ARGS(cxlmd, log, rec), + TP_ARGS(cxlmd, log, uuid, rec), TP_STRUCT__entry( CXL_EVT_TP_entry @@ -569,7 +569,7 @@ TRACE_EVENT(cxl_memory_module, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, uuid, rec->hdr); /* Memory Module Event */ __entry->event_type = rec->event_type; -- GitLab From c7ad3dc3649730af483ee1e78be5d0362da25bfe Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Fri, 3 Nov 2023 20:18:34 +0000 Subject: [PATCH 0638/1290] cxl/region: fix x9 interleave typo CXL supports x3, x6 and x12 - not x9. Fixes: 80d10a6cee050 ("cxl/region: Add interleave geometry attributes") Signed-off-by: Jim Harris Reviewed-by: Dave Jiang Reviewed-by: Fan Ni Link: https://lore.kernel.org/r/169904271254.204936.8580772404462743630.stgit@ubuntu Signed-off-by: Dan Williams --- drivers/cxl/core/region.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 3e817a6f94c6a..76fec47a13a4c 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -397,7 +397,7 @@ static ssize_t interleave_ways_store(struct device *dev, return rc; /* - * Even for x3, x9, and x12 interleaves the region interleave must be a + * Even for x3, x6, and x12 interleaves the region interleave must be a * power of 2 multiple of the host bridge interleave. */ if (!is_power_of_2(val / cxld->interleave_ways) || -- GitLab From b0f7e2d739b4aac131ea1662d086a07775097b05 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 3 Jan 2024 20:52:48 -0500 Subject: [PATCH 0639/1290] eventfs: Remove "lookup" parameter from create_dir/file_dentry() The "lookup" parameter is a way to differentiate the call to create_file/dir_dentry() from when it's just a lookup (no need to up the dentry refcount) and accessed via a readdir (need to up the refcount). But reality, it just makes the code more complex. Just up the refcount and let the caller decide to dput() the result or not. Link: https://lore.kernel.org/linux-trace-kernel/20240103102553.17a19cea@gandalf.local.home Link: https://lore.kernel.org/linux-trace-kernel/20240104015435.517502710@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Linus Torvalds Cc: Ajay Kaher Cc: Al Viro Cc: Christian Brauner Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 55 +++++++++++++++------------------------- 1 file changed, 20 insertions(+), 35 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index f0677ea0ec24e..c360300fb866e 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -390,16 +390,14 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) * @mode: The mode of the file. * @data: The data to use to set the inode of the file with on open() * @fops: The fops of the file to be created. - * @lookup: If called by the lookup routine, in which case, dput() the created dentry. * * Create a dentry for a file of an eventfs_inode @ei and place it into the - * address located at @e_dentry. If the @e_dentry already has a dentry, then - * just do a dget() on it and return. Otherwise create the dentry and attach it. + * address located at @e_dentry. */ static struct dentry * create_file_dentry(struct eventfs_inode *ei, int idx, struct dentry *parent, const char *name, umode_t mode, void *data, - const struct file_operations *fops, bool lookup) + const struct file_operations *fops) { struct eventfs_attr *attr = NULL; struct dentry **e_dentry = &ei->d_children[idx]; @@ -414,9 +412,7 @@ create_file_dentry(struct eventfs_inode *ei, int idx, } /* If the e_dentry already has a dentry, use it */ if (*e_dentry) { - /* lookup does not need to up the ref count */ - if (!lookup) - dget(*e_dentry); + dget(*e_dentry); mutex_unlock(&eventfs_mutex); return *e_dentry; } @@ -441,13 +437,12 @@ create_file_dentry(struct eventfs_inode *ei, int idx, * way to being freed, don't return it. If e_dentry is NULL * it means it was already freed. */ - if (ei->is_freed) + if (ei->is_freed) { dentry = NULL; - else + } else { dentry = *e_dentry; - /* The lookup does not need to up the dentry refcount */ - if (dentry && !lookup) dget(dentry); + } mutex_unlock(&eventfs_mutex); return dentry; } @@ -465,9 +460,6 @@ create_file_dentry(struct eventfs_inode *ei, int idx, } mutex_unlock(&eventfs_mutex); - if (lookup) - dput(dentry); - return dentry; } @@ -500,13 +492,12 @@ static void eventfs_post_create_dir(struct eventfs_inode *ei) * @pei: The eventfs_inode parent of ei. * @ei: The eventfs_inode to create the directory for * @parent: The dentry of the parent of this directory - * @lookup: True if this is called by the lookup code * * This creates and attaches a directory dentry to the eventfs_inode @ei. */ static struct dentry * create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, - struct dentry *parent, bool lookup) + struct dentry *parent) { struct dentry *dentry = NULL; @@ -518,11 +509,9 @@ create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, return NULL; } if (ei->dentry) { - /* If the dentry already has a dentry, use it */ + /* If the eventfs_inode already has a dentry, use it */ dentry = ei->dentry; - /* lookup does not need to up the ref count */ - if (!lookup) - dget(dentry); + dget(dentry); mutex_unlock(&eventfs_mutex); return dentry; } @@ -542,7 +531,7 @@ create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, * way to being freed. */ dentry = ei->dentry; - if (dentry && !lookup) + if (dentry) dget(dentry); mutex_unlock(&eventfs_mutex); return dentry; @@ -562,9 +551,6 @@ create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, } mutex_unlock(&eventfs_mutex); - if (lookup) - dput(dentry); - return dentry; } @@ -589,8 +575,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, struct eventfs_inode *ei; struct dentry *ei_dentry = NULL; struct dentry *ret = NULL; + struct dentry *d; const char *name = dentry->d_name.name; - bool created = false; umode_t mode; void *data; int idx; @@ -626,13 +612,10 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, ret = simple_lookup(dir, dentry, flags); if (IS_ERR(ret)) goto out; - create_dir_dentry(ei, ei_child, ei_dentry, true); - created = true; - break; - } - - if (created) + d = create_dir_dentry(ei, ei_child, ei_dentry); + dput(d); goto out; + } for (i = 0; i < ei->nr_entries; i++) { entry = &ei->entries[i]; @@ -650,8 +633,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, ret = simple_lookup(dir, dentry, flags); if (IS_ERR(ret)) goto out; - create_file_dentry(ei, i, ei_dentry, name, mode, cdata, - fops, true); + d = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops); + dput(d); break; } } @@ -768,9 +751,10 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) inode_lock(parent->d_inode); list_for_each_entry_srcu(ei_child, &ei->children, list, srcu_read_lock_held(&eventfs_srcu)) { - d = create_dir_dentry(ei, ei_child, parent, false); + d = create_dir_dentry(ei, ei_child, parent); if (d) { ret = add_dentries(&dentries, d, cnt); + dput(d); if (ret < 0) break; cnt++; @@ -790,9 +774,10 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) mutex_unlock(&eventfs_mutex); if (r <= 0) continue; - d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false); + d = create_file_dentry(ei, i, parent, name, mode, cdata, fops); if (d) { ret = add_dentries(&dentries, d, cnt); + dput(d); if (ret < 0) break; cnt++; -- GitLab From 493ec81a8fb8e4ada6f223b8b73791a1280d4774 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 3 Jan 2024 20:52:49 -0500 Subject: [PATCH 0640/1290] eventfs: Stop using dcache_readdir() for getdents() The eventfs creates dynamically allocated dentries and inodes. Using the dcache_readdir() logic for its own directory lookups requires hiding the cursor of the dcache logic and playing games to allow the dcache_readdir() to still have access to the cursor while the eventfs saved what it created and what it needs to release. Instead, just have eventfs have its own iterate_shared callback function that will fill in the dent entries. This simplifies the code quite a bit. Link: https://lore.kernel.org/linux-trace-kernel/20240104015435.682218477@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Linus Torvalds Cc: Ajay Kaher Cc: Al Viro Cc: Christian Brauner Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 194 +++++++++++++-------------------------- 1 file changed, 64 insertions(+), 130 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index c360300fb866e..41af56f44f0a0 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -52,9 +52,7 @@ enum { static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); -static int dcache_dir_open_wrapper(struct inode *inode, struct file *file); -static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx); -static int eventfs_release(struct inode *inode, struct file *file); +static int eventfs_iterate(struct file *file, struct dir_context *ctx); static void update_attr(struct eventfs_attr *attr, struct iattr *iattr) { @@ -148,11 +146,9 @@ static const struct inode_operations eventfs_file_inode_operations = { }; static const struct file_operations eventfs_file_operations = { - .open = dcache_dir_open_wrapper, .read = generic_read_dir, - .iterate_shared = dcache_readdir_wrapper, + .iterate_shared = eventfs_iterate, .llseek = generic_file_llseek, - .release = eventfs_release, }; /* Return the evenfs_inode of the "events" directory */ @@ -643,128 +639,87 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, return ret; } -struct dentry_list { - void *cursor; - struct dentry **dentries; -}; - -/** - * eventfs_release - called to release eventfs file/dir - * @inode: inode to be released - * @file: file to be released (not used) - */ -static int eventfs_release(struct inode *inode, struct file *file) -{ - struct tracefs_inode *ti; - struct dentry_list *dlist = file->private_data; - void *cursor; - int i; - - ti = get_tracefs(inode); - if (!(ti->flags & TRACEFS_EVENT_INODE)) - return -EINVAL; - - if (WARN_ON_ONCE(!dlist)) - return -EINVAL; - - for (i = 0; dlist->dentries && dlist->dentries[i]; i++) { - dput(dlist->dentries[i]); - } - - cursor = dlist->cursor; - kfree(dlist->dentries); - kfree(dlist); - file->private_data = cursor; - return dcache_dir_close(inode, file); -} - -static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt) -{ - struct dentry **tmp; - - tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_NOFS); - if (!tmp) - return -1; - tmp[cnt] = d; - tmp[cnt + 1] = NULL; - *dentries = tmp; - return 0; -} - -/** - * dcache_dir_open_wrapper - eventfs open wrapper - * @inode: not used - * @file: dir to be opened (to create it's children) - * - * Used to dynamic create file/dir with-in @file, all the - * file/dir will be created. If already created then references - * will be increased +/* + * Walk the children of a eventfs_inode to fill in getdents(). */ -static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) +static int eventfs_iterate(struct file *file, struct dir_context *ctx) { const struct file_operations *fops; + struct inode *f_inode = file_inode(file); const struct eventfs_entry *entry; struct eventfs_inode *ei_child; struct tracefs_inode *ti; struct eventfs_inode *ei; - struct dentry_list *dlist; - struct dentry **dentries = NULL; - struct dentry *parent = file_dentry(file); - struct dentry *d; - struct inode *f_inode = file_inode(file); - const char *name = parent->d_name.name; + struct dentry *ei_dentry = NULL; + struct dentry *dentry; + const char *name; umode_t mode; - void *data; - int cnt = 0; int idx; - int ret; - int i; - int r; + int ret = -EINVAL; + int ino; + int i, r, c; + + if (!dir_emit_dots(file, ctx)) + return 0; ti = get_tracefs(f_inode); if (!(ti->flags & TRACEFS_EVENT_INODE)) return -EINVAL; - if (WARN_ON_ONCE(file->private_data)) - return -EINVAL; + c = ctx->pos - 2; idx = srcu_read_lock(&eventfs_srcu); mutex_lock(&eventfs_mutex); ei = READ_ONCE(ti->private); + if (ei && !ei->is_freed) + ei_dentry = READ_ONCE(ei->dentry); mutex_unlock(&eventfs_mutex); - if (!ei) { - srcu_read_unlock(&eventfs_srcu, idx); - return -EINVAL; - } - - - data = ei->data; + if (!ei || !ei_dentry) + goto out; - dlist = kmalloc(sizeof(*dlist), GFP_KERNEL); - if (!dlist) { - srcu_read_unlock(&eventfs_srcu, idx); - return -ENOMEM; - } + ret = 0; - inode_lock(parent->d_inode); + /* + * Need to create the dentries and inodes to have a consistent + * inode number. + */ list_for_each_entry_srcu(ei_child, &ei->children, list, srcu_read_lock_held(&eventfs_srcu)) { - d = create_dir_dentry(ei, ei_child, parent); - if (d) { - ret = add_dentries(&dentries, d, cnt); - dput(d); - if (ret < 0) - break; - cnt++; + + if (c > 0) { + c--; + continue; } + + if (ei_child->is_freed) + continue; + + name = ei_child->name; + + dentry = create_dir_dentry(ei, ei_child, ei_dentry); + if (!dentry) + goto out; + ino = dentry->d_inode->i_ino; + dput(dentry); + + if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR)) + goto out; + ctx->pos++; } for (i = 0; i < ei->nr_entries; i++) { - void *cdata = data; + void *cdata = ei->data; + + if (c > 0) { + c--; + continue; + } + entry = &ei->entries[i]; name = entry->name; + mutex_lock(&eventfs_mutex); /* If ei->is_freed, then the event itself may be too */ if (!ei->is_freed) @@ -774,42 +729,21 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) mutex_unlock(&eventfs_mutex); if (r <= 0) continue; - d = create_file_dentry(ei, i, parent, name, mode, cdata, fops); - if (d) { - ret = add_dentries(&dentries, d, cnt); - dput(d); - if (ret < 0) - break; - cnt++; - } - } - inode_unlock(parent->d_inode); - srcu_read_unlock(&eventfs_srcu, idx); - ret = dcache_dir_open(inode, file); - /* - * dcache_dir_open() sets file->private_data to a dentry cursor. - * Need to save that but also save all the dentries that were - * opened by this function. - */ - dlist->cursor = file->private_data; - dlist->dentries = dentries; - file->private_data = dlist; - return ret; -} + dentry = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops); + if (!dentry) + goto out; + ino = dentry->d_inode->i_ino; + dput(dentry); -/* - * This just sets the file->private_data back to the cursor and back. - */ -static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx) -{ - struct dentry_list *dlist = file->private_data; - int ret; + if (!dir_emit(ctx, name, strlen(name), ino, DT_REG)) + goto out; + ctx->pos++; + } + ret = 1; + out: + srcu_read_unlock(&eventfs_srcu, idx); - file->private_data = dlist->cursor; - ret = dcache_readdir(file, ctx); - dlist->cursor = file->private_data; - file->private_data = dlist; return ret; } -- GitLab From 8186fff7ab649085e2c60d032d9a20a85af1d87c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 3 Jan 2024 21:50:16 -0500 Subject: [PATCH 0641/1290] tracefs/eventfs: Use root and instance inodes as default ownership Instead of walking the dentries on mount/remount to update the gid values of all the dentries if a gid option is specified on mount, just update the root inode. Add .getattr, .setattr, and .permissions on the tracefs inode operations to update the permissions of the files and directories. For all files and directories in the top level instance: /sys/kernel/tracing/* It will use the root inode as the default permissions. The inode that represents: /sys/kernel/tracing (or wherever it is mounted). When an instance is created: mkdir /sys/kernel/tracing/instance/foo The directory "foo" and all its files and directories underneath will use the default of what foo is when it was created. A remount of tracefs will not affect it. If a user were to modify the permissions of any file or directory in tracefs, it will also no longer be modified by a change in ownership of a remount. The events directory, if it is in the top level instance, will use the tracefs root inode as the default ownership for itself and all the files and directories below it. For the events directory in an instance ("foo"), it will keep the ownership of what it was when it was created, and that will be used as the default ownership for the files and directories beneath it. Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=wjVdGkjDXBbvLn2wbZnqP4UsH46E3gqJ9m7UG6DpX2+WA@mail.gmail.com/ Link: https://lore.kernel.org/linux-trace-kernel/20240103215016.1e0c9811@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Linus Torvalds Cc: Al Viro Cc: Christian Brauner Cc: Greg Kroah-Hartman Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 79 +++++++++++++++- fs/tracefs/inode.c | 198 ++++++++++++++++++++++----------------- fs/tracefs/internal.h | 3 + 3 files changed, 190 insertions(+), 90 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 41af56f44f0a0..72912b5f9a90b 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -45,6 +45,7 @@ enum { EVENTFS_SAVE_MODE = BIT(16), EVENTFS_SAVE_UID = BIT(17), EVENTFS_SAVE_GID = BIT(18), + EVENTFS_TOPLEVEL = BIT(19), }; #define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1) @@ -115,10 +116,17 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, * The events directory dentry is never freed, unless its * part of an instance that is deleted. It's attr is the * default for its child files and directories. - * Do not update it. It's not used for its own mode or ownership + * Do not update it. It's not used for its own mode or ownership. */ - if (!ei->is_events) + if (ei->is_events) { + /* But it still needs to know if it was modified */ + if (iattr->ia_valid & ATTR_UID) + ei->attr.mode |= EVENTFS_SAVE_UID; + if (iattr->ia_valid & ATTR_GID) + ei->attr.mode |= EVENTFS_SAVE_GID; + } else { update_attr(&ei->attr, iattr); + } } else { name = dentry->d_name.name; @@ -136,9 +144,66 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, return ret; } +static void update_top_events_attr(struct eventfs_inode *ei, struct dentry *dentry) +{ + struct inode *inode; + + /* Only update if the "events" was on the top level */ + if (!ei || !(ei->attr.mode & EVENTFS_TOPLEVEL)) + return; + + /* Get the tracefs root inode. */ + inode = d_inode(dentry->d_sb->s_root); + ei->attr.uid = inode->i_uid; + ei->attr.gid = inode->i_gid; +} + +static void set_top_events_ownership(struct inode *inode) +{ + struct tracefs_inode *ti = get_tracefs(inode); + struct eventfs_inode *ei = ti->private; + struct dentry *dentry; + + /* The top events directory doesn't get automatically updated */ + if (!ei || !ei->is_events || !(ei->attr.mode & EVENTFS_TOPLEVEL)) + return; + + dentry = ei->dentry; + + update_top_events_attr(ei, dentry); + + if (!(ei->attr.mode & EVENTFS_SAVE_UID)) + inode->i_uid = ei->attr.uid; + + if (!(ei->attr.mode & EVENTFS_SAVE_GID)) + inode->i_gid = ei->attr.gid; +} + +static int eventfs_get_attr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = d_backing_inode(dentry); + + set_top_events_ownership(inode); + + generic_fillattr(idmap, request_mask, inode, stat); + return 0; +} + +static int eventfs_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + set_top_events_ownership(inode); + return generic_permission(idmap, inode, mask); +} + static const struct inode_operations eventfs_root_dir_inode_operations = { .lookup = eventfs_root_lookup, .setattr = eventfs_set_attr, + .getattr = eventfs_get_attr, + .permission = eventfs_permission, }; static const struct inode_operations eventfs_file_inode_operations = { @@ -174,6 +239,8 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry) } while (!ei->is_events); mutex_unlock(&eventfs_mutex); + update_top_events_attr(ei, dentry); + return ei; } @@ -887,6 +954,14 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry uid = d_inode(dentry->d_parent)->i_uid; gid = d_inode(dentry->d_parent)->i_gid; + /* + * If the events directory is of the top instance, then parent + * is NULL. Set the attr.mode to reflect this and its permissions will + * default to the tracefs root dentry. + */ + if (!parent) + ei->attr.mode = EVENTFS_TOPLEVEL; + /* This is used as the default ownership of the files and directories */ ei->attr.uid = uid; ei->attr.gid = gid; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index bc86ffdb103bc..e1b172c0e091a 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -91,6 +91,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, struct inode *inode, struct dentry *dentry, umode_t mode) { + struct tracefs_inode *ti; char *name; int ret; @@ -98,6 +99,15 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, if (!name) return -ENOMEM; + /* + * This is a new directory that does not take the default of + * the rootfs. It becomes the default permissions for all the + * files and directories underneath it. + */ + ti = get_tracefs(inode); + ti->flags |= TRACEFS_INSTANCE_INODE; + ti->private = inode; + /* * The mkdir call can call the generic functions that create * the files within the tracefs system. It is up to the individual @@ -141,10 +151,76 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) return ret; } -static const struct inode_operations tracefs_dir_inode_operations = { +static void set_tracefs_inode_owner(struct inode *inode) +{ + struct tracefs_inode *ti = get_tracefs(inode); + struct inode *root_inode = ti->private; + + /* + * If this inode has never been referenced, then update + * the permissions to the superblock. + */ + if (!(ti->flags & TRACEFS_UID_PERM_SET)) + inode->i_uid = root_inode->i_uid; + + if (!(ti->flags & TRACEFS_GID_PERM_SET)) + inode->i_gid = root_inode->i_gid; +} + +static int tracefs_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + set_tracefs_inode_owner(inode); + return generic_permission(idmap, inode, mask); +} + +static int tracefs_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct inode *inode = d_backing_inode(path->dentry); + + set_tracefs_inode_owner(inode); + generic_fillattr(idmap, request_mask, inode, stat); + return 0; +} + +static int tracefs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + struct inode *inode = d_inode(dentry); + struct tracefs_inode *ti = get_tracefs(inode); + + if (ia_valid & ATTR_UID) + ti->flags |= TRACEFS_UID_PERM_SET; + + if (ia_valid & ATTR_GID) + ti->flags |= TRACEFS_GID_PERM_SET; + + return simple_setattr(idmap, dentry, attr); +} + +static const struct inode_operations tracefs_instance_dir_inode_operations = { .lookup = simple_lookup, .mkdir = tracefs_syscall_mkdir, .rmdir = tracefs_syscall_rmdir, + .permission = tracefs_permission, + .getattr = tracefs_getattr, + .setattr = tracefs_setattr, +}; + +static const struct inode_operations tracefs_dir_inode_operations = { + .lookup = simple_lookup, + .permission = tracefs_permission, + .getattr = tracefs_getattr, + .setattr = tracefs_setattr, +}; + +static const struct inode_operations tracefs_file_inode_operations = { + .permission = tracefs_permission, + .getattr = tracefs_getattr, + .setattr = tracefs_setattr, }; struct inode *tracefs_get_inode(struct super_block *sb) @@ -183,87 +259,6 @@ struct tracefs_fs_info { struct tracefs_mount_opts mount_opts; }; -static void change_gid(struct dentry *dentry, kgid_t gid) -{ - if (!dentry->d_inode) - return; - dentry->d_inode->i_gid = gid; -} - -/* - * Taken from d_walk, but without he need for handling renames. - * Nothing can be renamed while walking the list, as tracefs - * does not support renames. This is only called when mounting - * or remounting the file system, to set all the files to - * the given gid. - */ -static void set_gid(struct dentry *parent, kgid_t gid) -{ - struct dentry *this_parent; - struct list_head *next; - - this_parent = parent; - spin_lock(&this_parent->d_lock); - - change_gid(this_parent, gid); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct tracefs_inode *ti; - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); - next = tmp->next; - - /* Note, getdents() can add a cursor dentry with no inode */ - if (!dentry->d_inode) - continue; - - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - - change_gid(dentry, gid); - - /* If this is the events directory, update that too */ - ti = get_tracefs(dentry->d_inode); - if (ti && (ti->flags & TRACEFS_EVENT_INODE)) - eventfs_update_gid(dentry, gid); - - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, _RET_IP_); - this_parent = dentry; - spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); - goto repeat; - } - spin_unlock(&dentry->d_lock); - } - /* - * All done at this level ... ascend and resume the search. - */ - rcu_read_lock(); -ascend: - if (this_parent != parent) { - struct dentry *child = this_parent; - this_parent = child->d_parent; - - spin_unlock(&child->d_lock); - spin_lock(&this_parent->d_lock); - - /* go into the first sibling still alive */ - do { - next = child->d_child.next; - if (next == &this_parent->d_subdirs) - goto ascend; - child = list_entry(next, struct dentry, d_child); - } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); - rcu_read_unlock(); - goto resume; - } - rcu_read_unlock(); - spin_unlock(&this_parent->d_lock); - return; -} - static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) { substring_t args[MAX_OPT_ARGS]; @@ -336,10 +331,8 @@ static int tracefs_apply_options(struct super_block *sb, bool remount) if (!remount || opts->opts & BIT(Opt_uid)) inode->i_uid = opts->uid; - if (!remount || opts->opts & BIT(Opt_gid)) { - /* Set all the group ids to the mount option */ - set_gid(sb->s_root, opts->gid); - } + if (!remount || opts->opts & BIT(Opt_gid)) + inode->i_gid = opts->gid; return 0; } @@ -573,6 +566,26 @@ struct dentry *eventfs_end_creating(struct dentry *dentry) return dentry; } +/* Find the inode that this will use for default */ +static struct inode *instance_inode(struct dentry *parent, struct inode *inode) +{ + struct tracefs_inode *ti; + + /* If parent is NULL then use root inode */ + if (!parent) + return d_inode(inode->i_sb->s_root); + + /* Find the inode that is flagged as an instance or the root inode */ + while (!IS_ROOT(parent)) { + ti = get_tracefs(d_inode(parent)); + if (ti->flags & TRACEFS_INSTANCE_INODE) + break; + parent = parent->d_parent; + } + + return d_inode(parent); +} + /** * tracefs_create_file - create a file in the tracefs filesystem * @name: a pointer to a string containing the name of the file to create. @@ -603,6 +616,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { + struct tracefs_inode *ti; struct dentry *dentry; struct inode *inode; @@ -621,7 +635,11 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, if (unlikely(!inode)) return tracefs_failed_creating(dentry); + ti = get_tracefs(inode); + ti->private = instance_inode(parent, inode); + inode->i_mode = mode; + inode->i_op = &tracefs_file_inode_operations; inode->i_fop = fops ? fops : &tracefs_file_operations; inode->i_private = data; inode->i_uid = d_inode(dentry->d_parent)->i_uid; @@ -634,6 +652,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, static struct dentry *__create_dir(const char *name, struct dentry *parent, const struct inode_operations *ops) { + struct tracefs_inode *ti; struct dentry *dentry = tracefs_start_creating(name, parent); struct inode *inode; @@ -651,6 +670,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, inode->i_uid = d_inode(dentry->d_parent)->i_uid; inode->i_gid = d_inode(dentry->d_parent)->i_gid; + ti = get_tracefs(inode); + ti->private = instance_inode(parent, inode); + /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); @@ -681,7 +703,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) if (security_locked_down(LOCKDOWN_TRACEFS)) return NULL; - return __create_dir(name, parent, &simple_dir_inode_operations); + return __create_dir(name, parent, &tracefs_dir_inode_operations); } /** @@ -712,7 +734,7 @@ __init struct dentry *tracefs_create_instance_dir(const char *name, if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir)) return NULL; - dentry = __create_dir(name, parent, &tracefs_dir_inode_operations); + dentry = __create_dir(name, parent, &tracefs_instance_dir_inode_operations); if (!dentry) return NULL; diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 42bdeb471a072..12b7d0150ae9e 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -5,6 +5,9 @@ enum { TRACEFS_EVENT_INODE = BIT(1), TRACEFS_EVENT_TOP_INODE = BIT(2), + TRACEFS_GID_PERM_SET = BIT(3), + TRACEFS_UID_PERM_SET = BIT(4), + TRACEFS_INSTANCE_INODE = BIT(5), }; struct tracefs_inode { -- GitLab From daf7795406bf307997366f694888bd317ae5b5fa Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 Dec 2023 14:52:14 -0800 Subject: [PATCH 0642/1290] scsi: ufs: core: Simplify power management during async scan ufshcd_init() calls pm_runtime_get_sync() before it calls async_schedule(). ufshcd_async_scan() calls pm_runtime_put_sync() directly or indirectly from ufshcd_add_lus(). Simplify ufshcd_async_scan() by always calling pm_runtime_put_sync() from ufshcd_async_scan(). Cc: Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20231218225229.2542156-2-bvanassche@acm.org Reviewed-by: Can Guo Reviewed-by: Manivannan Sadhasivam Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index d6ae5d17892c8..0ad8bde39cd19 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8711,7 +8711,6 @@ static int ufshcd_add_lus(struct ufs_hba *hba) ufs_bsg_probe(hba); scsi_scan_host(hba->host); - pm_runtime_put_sync(hba->dev); out: return ret; @@ -8980,15 +8979,15 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) /* Probe and add UFS logical units */ ret = ufshcd_add_lus(hba); + out: + pm_runtime_put_sync(hba->dev); /* * If we failed to initialize the device or the device is not * present, turn off the power/clocks etc. */ - if (ret) { - pm_runtime_put_sync(hba->dev); + if (ret) ufshcd_hba_exit(hba); - } } static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) -- GitLab From ee36710912b2075c417100a8acc642c9c6496501 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 Dec 2023 14:52:15 -0800 Subject: [PATCH 0643/1290] scsi: ufs: core: Remove the ufshcd_hba_exit() call from ufshcd_async_scan() Calling ufshcd_hba_exit() from a function that is called asynchronously from ufshcd_init() is wrong because this triggers multiple race conditions. Instead of calling ufshcd_hba_exit(), log an error message. Reported-by: Daniel Mentz Fixes: 1d337ec2f35e ("ufs: improve init sequence") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20231218225229.2542156-3-bvanassche@acm.org Reviewed-by: Can Guo Reviewed-by: Manivannan Sadhasivam Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 0ad8bde39cd19..7c59d7a02243e 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8982,12 +8982,9 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) out: pm_runtime_put_sync(hba->dev); - /* - * If we failed to initialize the device or the device is not - * present, turn off the power/clocks etc. - */ + if (ret) - ufshcd_hba_exit(hba); + dev_err(hba->dev, "%s failed: %d\n", __func__, ret); } static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) -- GitLab From b08d86e6eb03566c5dc32e6ff10147f80aeb7511 Mon Sep 17 00:00:00 2001 From: ChanWoo Lee Date: Tue, 19 Dec 2023 17:27:40 +0900 Subject: [PATCH 0644/1290] scsi: ufs: qcom: Remove unnecessary goto statement from ufs_qcom_config_esi() There is only one place where goto is used, and it is unnecessary to check the ret value through 'goto out' because the ret value is already true. Therefore, remove the goto statement and integrate the '!ret' condition into the existing code. Signed-off-by: ChanWoo Lee Link: https://lore.kernel.org/r/20231219082740.27644-1-cw9316.lee@samsung.com Reviewed-by: Christophe JAILLET Reviewed-by: Manivannan Sadhasivam Signed-off-by: Martin K. Petersen --- drivers/ufs/host/ufs-qcom.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 9fd8d737edea9..0879f5ed4e5b1 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -1714,7 +1714,7 @@ static int ufs_qcom_config_esi(struct ufs_hba *hba) ufs_qcom_write_msi_msg); if (ret) { dev_err(hba->dev, "Failed to request Platform MSI %d\n", ret); - goto out; + return ret; } msi_lock_descs(hba->dev); @@ -1748,11 +1748,8 @@ static int ufs_qcom_config_esi(struct ufs_hba *hba) FIELD_PREP(ESI_VEC_MASK, MAX_ESI_VEC - 1), REG_UFS_CFG3); ufshcd_mcq_enable_esi(hba); - } - -out: - if (!ret) host->esi_enabled = true; + } return ret; } -- GitLab From c6d5aa44eaf6d119f9ceb3bfc7d22405ac04232a Mon Sep 17 00:00:00 2001 From: David Strahan Date: Tue, 19 Dec 2023 13:36:50 -0600 Subject: [PATCH 0645/1290] scsi: smartpqi: Add new controller PCI IDs All PCI ID entries in Hex. Add PCI IDs for Cisco controllers: VID / DID / SVID / SDID ---- ---- ---- ---- Cisco 24G TriMode M1 RAID 4GB FBWC 32D 9005 / 028f / 1137 / 02f8 Cisco 24G TriMode M1 RAID 4GB FBWC 16D 9005 / 028f / 1137 / 02f9 Cisco 24G TriMode M1 HBA 16D 9005 / 028f / 1137 / 02fa Add PCI IDs for CloudNine controllers: VID / DID / SVID / SDID ---- ---- ---- ---- SmartRAID P7604N-16i 9005 / 028f / 1f51 / 100e SmartRAID P7604N-8i 9005 / 028f / 1f51 / 100f SmartRAID P7504N-16i 9005 / 028f / 1f51 / 1010 SmartRAID P7504N-8i 9005 / 028f / 1f51 / 1011 SmartRAID P7504N-8i 9005 / 028f / 1f51 / 1043 SmartHBA P6500-8i 9005 / 028f / 1f51 / 1044 SmartRAID P7504-8i 9005 / 028f / 1f51 / 1045 Reviewed-by: Murthy Bhat Reviewed-by: Mahesh Rajashekhara Reviewed-by: Scott Teel Reviewed-by: Scott Benesh Reviewed-by: Mike McGowen Reviewed-by: Kevin Barnett Signed-off-by: David Strahan Signed-off-by: Don Brace Link: https://lore.kernel.org/r/20231219193653.277553-2-don.brace@microchip.com Signed-off-by: Martin K. Petersen --- drivers/scsi/smartpqi/smartpqi_init.c | 40 +++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index 9a58df9312fa7..d562011200877 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -10142,6 +10142,18 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, 0x1014, 0x0718) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1137, 0x02f8) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1137, 0x02f9) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1137, 0x02fa) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, 0x1e93, 0x1000) @@ -10198,6 +10210,34 @@ static const struct pci_device_id pqi_pci_id_table[] = { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, 0x1f51, 0x100a) }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x100e) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x100f) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1010) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1011) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1043) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1044) + }, + { + PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, + 0x1f51, 0x1045) + }, { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x028f, PCI_ANY_ID, PCI_ANY_ID) -- GitLab From fb4cece17b4583f55b34a8538e27a4adc833c9d4 Mon Sep 17 00:00:00 2001 From: Mahesh Rajashekhara Date: Tue, 19 Dec 2023 13:36:51 -0600 Subject: [PATCH 0646/1290] scsi: smartpqi: Fix logical volume rescan race condition Correct rescan flag race condition. Multiple conditions are being evaluated before notifying OS to do a rescan. Driver will skip rescanning the device if any one of the following conditions are met: - Devices that have not yet been added to the OS or devices that have been removed. - Devices which are already marked for removal or in the phase of removal. Under very rare conditions, after logical volume size expansion, the OS still sees the size of the logical volume which was before expansion. The rescan flag in the driver is used to signal the need for a logical volume rescan. A race condition can occur in the driver, and it leads to one thread overwriting the flag inadvertently. As a result, driver is not notifying the OS SML to rescan the logical volume. Move device->rescan update into new function pqi_mark_volumes_for_rescan() and protect with a spin lock. Move check for device->rescan into new function pqi_volume_rescan_needed() and protect function call with a spin_lock. Reviewed-by: Scott Teel Reviewed-by: Scott Benesh Reviewed-by: Mike McGowen Reviewed-by: Kevin Barnett Co-developed-by: Murthy Bhat Signed-off-by: Murthy Bhat Signed-off-by: Mahesh Rajashekhara Signed-off-by: Don Brace Link: https://lore.kernel.org/r/20231219193653.277553-3-don.brace@microchip.com Signed-off-by: Martin K. Petersen --- drivers/scsi/smartpqi/smartpqi.h | 1 - drivers/scsi/smartpqi/smartpqi_init.c | 43 ++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/smartpqi/smartpqi.h b/drivers/scsi/smartpqi/smartpqi.h index 0419401835169..cdedc271857aa 100644 --- a/drivers/scsi/smartpqi/smartpqi.h +++ b/drivers/scsi/smartpqi/smartpqi.h @@ -1347,7 +1347,6 @@ struct pqi_ctrl_info { bool controller_online; bool block_requests; bool scan_blocked; - u8 logical_volume_rescan_needed : 1; u8 inbound_spanning_supported : 1; u8 outbound_spanning_supported : 1; u8 pqi_mode_enabled : 1; diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index d562011200877..081bb2c098063 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -2093,8 +2093,6 @@ static void pqi_scsi_update_device(struct pqi_ctrl_info *ctrl_info, if (existing_device->devtype == TYPE_DISK) { existing_device->raid_level = new_device->raid_level; existing_device->volume_status = new_device->volume_status; - if (ctrl_info->logical_volume_rescan_needed) - existing_device->rescan = true; memset(existing_device->next_bypass_group, 0, sizeof(existing_device->next_bypass_group)); if (!pqi_raid_maps_equal(existing_device->raid_map, new_device->raid_map)) { kfree(existing_device->raid_map); @@ -2164,6 +2162,20 @@ static inline void pqi_init_device_tmf_work(struct pqi_scsi_dev *device) INIT_WORK(&tmf_work->work_struct, pqi_tmf_worker); } +static inline bool pqi_volume_rescan_needed(struct pqi_scsi_dev *device) +{ + if (pqi_device_in_remove(device)) + return false; + + if (device->sdev == NULL) + return false; + + if (!scsi_device_online(device->sdev)) + return false; + + return device->rescan; +} + static void pqi_update_device_list(struct pqi_ctrl_info *ctrl_info, struct pqi_scsi_dev *new_device_list[], unsigned int num_new_devices) { @@ -2284,9 +2296,13 @@ static void pqi_update_device_list(struct pqi_ctrl_info *ctrl_info, if (device->sdev && device->queue_depth != device->advertised_queue_depth) { device->advertised_queue_depth = device->queue_depth; scsi_change_queue_depth(device->sdev, device->advertised_queue_depth); - if (device->rescan) { - scsi_rescan_device(device->sdev); + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + if (pqi_volume_rescan_needed(device)) { device->rescan = false; + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); + scsi_rescan_device(device->sdev); + } else { + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); } } } @@ -2308,8 +2324,6 @@ static void pqi_update_device_list(struct pqi_ctrl_info *ctrl_info, } } - ctrl_info->logical_volume_rescan_needed = false; - } static inline bool pqi_is_supported_device(struct pqi_scsi_dev *device) @@ -3702,6 +3716,21 @@ static bool pqi_ofa_process_event(struct pqi_ctrl_info *ctrl_info, return ack_event; } +static void pqi_mark_volumes_for_rescan(struct pqi_ctrl_info *ctrl_info) +{ + unsigned long flags; + struct pqi_scsi_dev *device; + + spin_lock_irqsave(&ctrl_info->scsi_device_list_lock, flags); + + list_for_each_entry(device, &ctrl_info->scsi_device_list, scsi_device_list_entry) { + if (pqi_is_logical_device(device) && device->devtype == TYPE_DISK) + device->rescan = true; + } + + spin_unlock_irqrestore(&ctrl_info->scsi_device_list_lock, flags); +} + static void pqi_disable_raid_bypass(struct pqi_ctrl_info *ctrl_info) { unsigned long flags; @@ -3742,7 +3771,7 @@ static void pqi_event_worker(struct work_struct *work) ack_event = true; rescan_needed = true; if (event->event_type == PQI_EVENT_TYPE_LOGICAL_DEVICE) - ctrl_info->logical_volume_rescan_needed = true; + pqi_mark_volumes_for_rescan(ctrl_info); else if (event->event_type == PQI_EVENT_TYPE_AIO_STATE_CHANGE) pqi_disable_raid_bypass(ctrl_info); } -- GitLab From 8c9955107762a23043db544d83959c4e0103bae3 Mon Sep 17 00:00:00 2001 From: Don Brace Date: Tue, 19 Dec 2023 13:36:52 -0600 Subject: [PATCH 0647/1290] scsi: smartpqi: Bump driver version to 2.1.26-030 Reviewed-by: Mahesh Rajashekhara Reviewed-by: Murthy Bhat Reviewed-by: Scott Benesh Reviewed-by: Scott Teel Reviewed-by: Mike McGowen Reviewed-by: Kevin Barnett Signed-off-by: Don Brace Link: https://lore.kernel.org/r/20231219193653.277553-4-don.brace@microchip.com Signed-off-by: Martin K. Petersen --- drivers/scsi/smartpqi/smartpqi_init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index 081bb2c098063..ceff1ec13f9ea 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -33,11 +33,11 @@ #define BUILD_TIMESTAMP #endif -#define DRIVER_VERSION "2.1.24-046" +#define DRIVER_VERSION "2.1.26-030" #define DRIVER_MAJOR 2 #define DRIVER_MINOR 1 -#define DRIVER_RELEASE 24 -#define DRIVER_REVISION 46 +#define DRIVER_RELEASE 26 +#define DRIVER_REVISION 30 #define DRIVER_NAME "Microchip SmartPQI Driver (v" \ DRIVER_VERSION BUILD_TIMESTAMP ")" -- GitLab From 904fdd2062f3101fb09db8ee077abf7ffd95e538 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 20 Dec 2023 21:31:13 -0800 Subject: [PATCH 0648/1290] scsi: mpi3mr: Fix mpi3mr_fw.c kernel-doc warnings Use correct format for function return values. Delete blank lines that are reported as "bad line:". mpi3mr_fw.c:482: warning: No description found for return value of 'mpi3mr_get_reply_desc' mpi3mr_fw.c:1066: warning: bad line: mpi3mr_fw.c:1109: warning: bad line: mpi3mr_fw.c:1249: warning: No description found for return value of 'mpi3mr_revalidate_factsdata' Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20231221053113.32191-1-rdunlap@infradead.org Cc: Sathya Prakash Veerichetty Cc: Kashyap Desai Cc: Sumit Saxena Cc: Sreekanth Reddy Cc: Cc: James E.J. Bottomley Cc: Martin K. Petersen Cc: Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_fw.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index d8c57a0a518f4..528f19f782f21 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -475,7 +475,7 @@ int mpi3mr_process_admin_reply_q(struct mpi3mr_ioc *mrioc) * @op_reply_q: op_reply_qinfo object * @reply_ci: operational reply descriptor's queue consumer index * - * Returns reply descriptor frame address + * Returns: reply descriptor frame address */ static inline struct mpi3_default_reply_descriptor * mpi3mr_get_reply_desc(struct op_reply_qinfo *op_reply_q, u32 reply_ci) @@ -1063,7 +1063,6 @@ enum mpi3mr_iocstate mpi3mr_get_iocstate(struct mpi3mr_ioc *mrioc) * @mrioc: Adapter instance reference * * Free the DMA memory allocated for IOCTL handling purpose. - * * Return: None */ @@ -1106,7 +1105,6 @@ static void mpi3mr_free_ioctl_dma_memory(struct mpi3mr_ioc *mrioc) /** * mpi3mr_alloc_ioctl_dma_memory - Alloc memory for ioctl dma * @mrioc: Adapter instance reference - * * This function allocates dmaable memory required to handle the * application issued MPI3 IOCTL requests. @@ -1241,7 +1239,7 @@ static int mpi3mr_issue_and_process_mur(struct mpi3mr_ioc *mrioc, * during reset/resume * @mrioc: Adapter instance reference * - * Return zero if the new IOCFacts parameters value is compatible with + * Return: zero if the new IOCFacts parameters value is compatible with * older values else return -EPERM */ static int -- GitLab From 1342ad786073e96fa813ad943c19f586157ae297 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Mon, 4 Dec 2023 21:19:44 +0300 Subject: [PATCH 0649/1290] apparmor: fix possible memory leak in unpack_trans_table If we fail to unpack the transition table then the table elements which have been already allocated are not freed on error path. unreferenced object 0xffff88802539e000 (size 128): comm "apparmor_parser", pid 903, jiffies 4294914938 (age 35.085s) hex dump (first 32 bytes): 20 73 6f 6d 65 20 6e 61 73 74 79 20 73 74 72 69 some nasty stri 6e 67 20 73 6f 6d 65 20 6e 61 73 74 79 20 73 74 ng some nasty st backtrace: [] __kmem_cache_alloc_node+0x1e2/0x2d0 [] __kmalloc_node_track_caller+0x54/0x170 [] kmemdup+0x29/0x60 [] aa_unpack_strdup+0xe5/0x1b0 [] unpack_pdb+0xeb8/0x2700 [] unpack_profile+0x1507/0x4a30 [] aa_unpack+0x36a/0x1560 [] aa_replace_profiles+0x213/0x33c0 [] policy_update+0x261/0x370 [] profile_replace+0x20e/0x2a0 [] vfs_write+0x2af/0xe00 [] ksys_write+0x126/0x250 [] do_syscall_64+0x46/0xf0 [] entry_SYSCALL_64_after_hwframe+0x6e/0x76 Call aa_free_str_table() on error path as was done before the blamed commit. It implements all necessary checks, frees str_table if it is available and nullifies the pointers. Found by Linux Verification Center (linuxtesting.org). Fixes: a0792e2ceddc ("apparmor: make transition table unpack generic so it can be reused") Signed-off-by: Fedor Pchelkin Signed-off-by: John Johansen --- security/apparmor/lib.c | 1 + security/apparmor/policy_unpack.c | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 4c198d273f091..cd569fbbfe36d 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -41,6 +41,7 @@ void aa_free_str_table(struct aa_str_table *t) kfree_sensitive(t->table[i]); kfree_sensitive(t->table); t->table = NULL; + t->size = 0; } } diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 47ec097d6741f..9575da5fd4cb4 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -478,6 +478,8 @@ static bool unpack_trans_table(struct aa_ext *e, struct aa_str_table *strs) if (!table) goto fail; + strs->table = table; + strs->size = size; for (i = 0; i < size; i++) { char *str; int c, j, pos, size2 = aa_unpack_strdup(e, &str, NULL); @@ -520,14 +522,11 @@ static bool unpack_trans_table(struct aa_ext *e, struct aa_str_table *strs) goto fail; if (!aa_unpack_nameX(e, AA_STRUCTEND, NULL)) goto fail; - - strs->table = table; - strs->size = size; } return true; fail: - kfree_sensitive(table); + aa_free_str_table(strs); e->pos = saved_pos; return false; } -- GitLab From 55a8210c9e7d21ff2644809699765796d4bfb200 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Thu, 28 Dec 2023 19:07:43 +0300 Subject: [PATCH 0650/1290] apparmor: avoid crash when parsed profile name is empty When processing a packed profile in unpack_profile() described like "profile :ns::samba-dcerpcd /usr/lib*/samba/{,samba/}samba-dcerpcd {...}" a string ":samba-dcerpcd" is unpacked as a fully-qualified name and then passed to aa_splitn_fqname(). aa_splitn_fqname() treats ":samba-dcerpcd" as only containing a namespace. Thus it returns NULL for tmpname, meanwhile tmpns is non-NULL. Later aa_alloc_profile() crashes as the new profile name is NULL now. general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 6 PID: 1657 Comm: apparmor_parser Not tainted 6.7.0-rc2-dirty #16 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 RIP: 0010:strlen+0x1e/0xa0 Call Trace: ? strlen+0x1e/0xa0 aa_policy_init+0x1bb/0x230 aa_alloc_profile+0xb1/0x480 unpack_profile+0x3bc/0x4960 aa_unpack+0x309/0x15e0 aa_replace_profiles+0x213/0x33c0 policy_update+0x261/0x370 profile_replace+0x20e/0x2a0 vfs_write+0x2af/0xe00 ksys_write+0x126/0x250 do_syscall_64+0x46/0xf0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 ---[ end trace 0000000000000000 ]--- RIP: 0010:strlen+0x1e/0xa0 It seems such behaviour of aa_splitn_fqname() is expected and checked in other places where it is called (e.g. aa_remove_profiles). Well, there is an explicit comment "a ns name without a following profile is allowed" inside. AFAICS, nothing can prevent unpacked "name" to be in form like ":samba-dcerpcd" - it is passed from userspace. Deny the whole profile set replacement in such case and inform user with EPROTO and an explaining message. Found by Linux Verification Center (linuxtesting.org). Fixes: 04dc715e24d0 ("apparmor: audit policy ns specified in policy load") Signed-off-by: Fedor Pchelkin Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 9575da5fd4cb4..dbf7d96257ada 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -832,6 +832,10 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) tmpname = aa_splitn_fqname(name, strlen(name), &tmpns, &ns_len); if (tmpns) { + if (!tmpname) { + info = "empty profile name"; + goto fail; + } *ns_name = kstrndup(tmpns, ns_len, GFP_KERNEL); if (!*ns_name) { info = "out of memory"; -- GitLab From 6c2c1e0009e97381a032d8c84747a46082fd327c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 3 Jan 2024 12:08:46 +0000 Subject: [PATCH 0651/1290] 9p: Do a couple of cleanups Do a couple of cleanups to 9p: (1) Remove a couple of unused variables. (2) Turn a BUG_ON() into a warning, consolidate with another warning and make the warning message include the inode number rather than whatever's in i_private (which will get hashed anyway). Suggested-by: Dominique Martinet Link: https://lore.kernel.org/r/ZZULNQAZ0n0WQv7p@codewreck.org/ Signed-off-by: David Howells Acked-by: Dominique Martinet cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Christian Schoenebeck cc: v9fs@lists.linux.dev cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org --- fs/9p/vfs_addr.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index d8fb407189a09..f7f83eec3bccf 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -28,8 +28,6 @@ static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq) { - struct inode *inode = subreq->rreq->inode; - struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); struct p9_fid *fid = subreq->rreq->netfs_priv; int err; @@ -98,15 +96,13 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) if (file) { fid = file->private_data; - BUG_ON(!fid); + if (!fid) + goto no_fid; p9_fid_get(fid); } else { fid = v9fs_fid_find_inode(rreq->inode, writing, INVALID_UID, true); - if (!fid) { - WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n", - rreq->inode->i_private); - return -EINVAL; - } + if (!fid) + goto no_fid; } /* we might need to read from a fid that was opened write-only @@ -115,6 +111,11 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && !(fid->mode & P9_ORDWR)); rreq->netfs_priv = fid; return 0; + +no_fid: + WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n", + rreq->inode->i_ino); + return -EINVAL; } /** -- GitLab From 252cf7b2eaf7cb904580ffbb0126d23411bcb43d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 3 Jan 2024 14:30:48 +0000 Subject: [PATCH 0652/1290] 9p: Use length of data written to the server in preference to error In v9fs_upload_to_server(), we pass the error to netfslib to terminate the subreq rather than the amount of data written - even if we did actually write something. Further, we assume that the write is always entirely done if successful - but it might have been partially complete - as returned by p9_client_write(), but we ignore that. Fix this by indicating the amount written by preference and only returning the error if we didn't write anything. (We might want to return both in future if both are available as this might be useful as to whether we retry or not.) Suggested-by: Dominique Martinet Link: https://lore.kernel.org/r/ZZULNQAZ0n0WQv7p@codewreck.org/ Signed-off-by: David Howells Acked-by: Dominique Martinet cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Christian Schoenebeck cc: v9fs@lists.linux.dev cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org --- fs/9p/vfs_addr.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index f7f83eec3bccf..047855033d32f 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -29,12 +29,11 @@ static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq) { struct p9_fid *fid = subreq->rreq->netfs_priv; - int err; + int err, len; trace_netfs_sreq(subreq, netfs_sreq_trace_submit); - p9_client_write(fid, subreq->start, &subreq->io_iter, &err); - netfs_write_subrequest_terminated(subreq, err < 0 ? err : subreq->len, - false); + len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); + netfs_write_subrequest_terminated(subreq, len ?: err, false); } static void v9fs_upload_to_server_worker(struct work_struct *work) -- GitLab From 040a82be54c09a72162a3db2f5cd2ba289c0f224 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Oct 2023 12:26:31 +0100 Subject: [PATCH 0653/1290] netfs: Rearrange netfs_io_subrequest to put request pointer first Rearrange the netfs_io_subrequest struct to put the netfs_io_request pointer (rreq) first. This then allows netfs_io_subrequest to be put in a union with a pointer to a wrapper around netfs_io_request. This will be useful in the future for cifs and maybe ceph. Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: Rohith Surabattula cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 852956aa3c4bb..d3bac60fcd6f3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -204,8 +204,8 @@ struct netfs_cache_resources { * the pages it points to can be relied on to exist for the duration. */ struct netfs_io_subrequest { - struct work_struct work; struct netfs_io_request *rreq; /* Supervising I/O request */ + struct work_struct work; struct list_head rreq_link; /* Link in rreq->subrequests */ struct iov_iter io_iter; /* Iterator for this subrequest */ loff_t start; /* Where to start the I/O */ -- GitLab From 43833f2ba5ce1543148a1b7cdd2513f5a663a17c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 3 Jan 2024 21:08:11 +0000 Subject: [PATCH 0654/1290] netfs: Fix proc/fs/fscache symlink to point to "netfs" not "../netfs" Fix the proc/fs/fscache symlink to point to "netfs" not "../netfs". Reported-by: Marc Dionne Signed-off-by: David Howells cc: Jeff Layton cc: Christian Brauner cc: linux-fsdevel@vger.kernel.org cc: linux-cachefs@redhat.com --- fs/netfs/fscache_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/netfs/fscache_proc.c b/fs/netfs/fscache_proc.c index ecd0d1edafaa4..874d951bc3901 100644 --- a/fs/netfs/fscache_proc.c +++ b/fs/netfs/fscache_proc.c @@ -16,7 +16,7 @@ */ int __init fscache_proc_init(void) { - if (!proc_symlink("fs/fscache", NULL, "../netfs")) + if (!proc_symlink("fs/fscache", NULL, "netfs")) goto error_sym; if (!proc_create_seq("fs/netfs/caches", S_IFREG | 0444, NULL, -- GitLab From ec4fcc6b6a63585af8897b49d9aae92af994505d Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 20 Dec 2023 19:09:05 -0800 Subject: [PATCH 0655/1290] Input: da9063_onkey - avoid using OF-specific APIs There is nothing OF-specific in the driver, so switch from OF properties helpers to generic device helpers. Reviewed-by: Biju Das Link: https://lore.kernel.org/r/ZYOsUfKceOFXuCt5@google.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/da9063_onkey.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/input/misc/da9063_onkey.c b/drivers/input/misc/da9063_onkey.c index a8b7f1cd0ec2a..ce499c28a7b26 100644 --- a/drivers/input/misc/da9063_onkey.c +++ b/drivers/input/misc/da9063_onkey.c @@ -9,11 +9,12 @@ #include #include #include +#include #include #include +#include #include #include -#include #include #include #include @@ -199,8 +200,8 @@ static int da9063_onkey_probe(struct platform_device *pdev) return dev_err_probe(&pdev->dev, -ENXIO, "Parent regmap unavailable.\n"); - onkey->key_power = !of_property_read_bool(pdev->dev.of_node, - "dlg,disable-key-power"); + onkey->key_power = !device_property_read_bool(&pdev->dev, + "dlg,disable-key-power"); onkey->input = devm_input_allocate_device(&pdev->dev); if (!onkey->input) -- GitLab From 0c64117d112b35dc3ba2020108ec26f538b95ae6 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 20 Dec 2023 19:09:45 -0800 Subject: [PATCH 0656/1290] Input: da9063_onkey - avoid explicitly setting input's parent devm_input_allocate_device() already sets parent of the new input device, there's no need to set it up explicitly. Reviewed-by: Biju Das Link: https://lore.kernel.org/r/ZYOseYfVgg0Ve6Zl@google.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/da9063_onkey.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/input/misc/da9063_onkey.c b/drivers/input/misc/da9063_onkey.c index ce499c28a7b26..c338765e0ecd0 100644 --- a/drivers/input/misc/da9063_onkey.c +++ b/drivers/input/misc/da9063_onkey.c @@ -211,7 +211,6 @@ static int da9063_onkey_probe(struct platform_device *pdev) snprintf(onkey->phys, sizeof(onkey->phys), "%s/input0", onkey->config->name); onkey->input->phys = onkey->phys; - onkey->input->dev.parent = &pdev->dev; input_set_capability(onkey->input, EV_KEY, KEY_POWER); -- GitLab From 982b6acec662ff06e9e89c61838f297f6544a84d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 3 Jan 2024 23:42:56 -0800 Subject: [PATCH 0657/1290] perf vendor events intel: Alderlake/rocketlake metric fixes Fix that the core PMU is being specified for 2 uncore events. Specify a PMU for the alderlake UNCORE_FREQ metric. Conversion script updated in: https://github.com/intel/perfmon/pull/126 Committer testing: Before this patch the "perf all metricgroups test" was failing, now: root@number:~# perf test metric 10: PMU events : 10.3: Parsing of PMU event table metrics : Ok 10.4: Parsing of PMU event table metrics with fake PMUs : Ok 10.5: Parsing of metric thresholds with fake PMUs : Ok 61: Parse and process metrics : Ok 98: perf stat metrics (shadow stat) test : Skip 101: perf all metricgroups test : Ok 102: perf all metrics test : FAILED! 107: perf metrics value validation : Ok root@number:~# Test 102 is failing for another reason, not being able to get as many counters as needed, Ian Rogers suggested disabling the NMI watchdog to have more counters available: root@number:/home/acme# cat /proc/sys/kernel/nmi_watchdog 1 root@number:/home/acme# echo 0 > /proc/sys/kernel/nmi_watchdog root@number:/home/acme# perf test 102 102: perf all metrics test : Ok root@number:/home/acme# Closes: https://lore.kernel.org/lkml/ZZWOdHXJJ_oecWwm@kernel.org/ Reported-by: Arnaldo Carvalho de Melo Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240104074259.653219-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/x86/alderlake/adl-metrics.json | 15 ++++++++------- .../arch/x86/rocketlake/rkl-metrics.json | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json index 3388b58b8f1a6..35124a4ddcb2b 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json @@ -69,12 +69,6 @@ "MetricName": "C9_Pkg_Residency", "ScaleUnit": "100%" }, - { - "BriefDescription": "Uncore frequency per die [GHZ]", - "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", - "MetricGroup": "SoC", - "MetricName": "UNCORE_FREQ" - }, { "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)", @@ -809,6 +803,13 @@ "ScaleUnit": "100%", "Unit": "cpu_atom" }, + { + "BriefDescription": "Uncore frequency per die [GHZ]", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", + "MetricGroup": "SoC", + "MetricName": "UNCORE_FREQ", + "Unit": "cpu_core" + }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", "MetricExpr": "(cpu_core@UOPS_DISPATCHED.PORT_0@ + cpu_core@UOPS_DISPATCHED.PORT_1@ + cpu_core@UOPS_DISPATCHED.PORT_5_11@ + cpu_core@UOPS_DISPATCHED.PORT_6@) / (5 * tma_info_core_core_clks)", @@ -1838,7 +1839,7 @@ }, { "BriefDescription": "Average number of parallel data read requests to external memory", - "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / cpu_core@UNC_ARB_DAT_OCCUPANCY.RD\\,cmask\\=1@", + "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / UNC_ARB_DAT_OCCUPANCY.RD@cmask\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", "MetricName": "tma_info_system_mem_parallel_reads", "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches", diff --git a/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json b/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json index 0c880e4156699..27433fc15ede7 100644 --- a/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json @@ -985,7 +985,7 @@ }, { "BriefDescription": "Average number of parallel data read requests to external memory", - "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / cpu@UNC_ARB_DAT_OCCUPANCY.RD\\,cmask\\=1@", + "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / UNC_ARB_DAT_OCCUPANCY.RD@cmask\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", "MetricName": "tma_info_system_mem_parallel_reads", "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" -- GitLab From 576d7fed09c7edbae7600f29a8a3ed6c1ead904f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 3 Jan 2024 23:42:57 -0800 Subject: [PATCH 0658/1290] perf vendor events intel: Update emeraldrapids events to v1.02 Update to v1.02 released in: https://github.com/intel/perfmon/pull/123 Removes events AMX_OPS_RETIRED.BF16 and AMX_OPS_RETIRED.INT8. Add events FP_ARITH_DISPATCHED.V0, FP_ARITH_DISPATCHED.V1, FP_ARITH_DISPATCHED.V2, UNC_IIO_IOMMU0.1G_HITS, UNC_IIO_IOMMU0.2M_HITS and UNC_IIO_IOMMU0.4K_HITS. Description updates. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240104074259.653219-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- .../x86/emeraldrapids/floating-point.json | 27 +++++++++++++++-- .../arch/x86/emeraldrapids/pipeline.json | 18 +---------- .../emeraldrapids/uncore-interconnect.json | 8 ++--- .../arch/x86/emeraldrapids/uncore-io.json | 30 +++++++++++++++++++ tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 5 files changed, 60 insertions(+), 25 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/floating-point.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/floating-point.json index 4a9d211e9d4f1..1bdefaf962877 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/floating-point.json @@ -23,26 +23,47 @@ "UMask": "0x10" }, { - "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0", + "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0]", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_0", "SampleAfterValue": "2000003", "UMask": "0x1" }, { - "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1", + "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1]", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_1", "SampleAfterValue": "2000003", "UMask": "0x2" }, { - "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5", + "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2]", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_5", "SampleAfterValue": "2000003", "UMask": "0x4" }, + { + "BriefDescription": "FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0]", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V0", + "SampleAfterValue": "2000003", + "UMask": "0x1" + }, + { + "BriefDescription": "FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1]", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V1", + "SampleAfterValue": "2000003", + "UMask": "0x2" + }, + { + "BriefDescription": "FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5]", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V2", + "SampleAfterValue": "2000003", + "UMask": "0x4" + }, { "BriefDescription": "Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "EventCode": "0xc7", diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json index 6dcf3b763af4f..1f8200fb89647 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json @@ -1,20 +1,4 @@ [ - { - "BriefDescription": "AMX retired arithmetic BF16 operations.", - "EventCode": "0xce", - "EventName": "AMX_OPS_RETIRED.BF16", - "PublicDescription": "Number of AMX-based retired arithmetic bfloat16 (BF16) floating-point operations. Counts TDPBF16PS FP instructions. SW to use operation multiplier of 4", - "SampleAfterValue": "1000003", - "UMask": "0x2" - }, - { - "BriefDescription": "AMX retired arithmetic integer 8-bit operations.", - "EventCode": "0xce", - "EventName": "AMX_OPS_RETIRED.INT8", - "PublicDescription": "Number of AMX-based retired arithmetic integer operations of 8-bit width source operands. Counts TDPB[SS,UU,US,SU]D instructions. SW should use operation multiplier of 8.", - "SampleAfterValue": "1000003", - "UMask": "0x1" - }, { "BriefDescription": "This event is deprecated. Refer to new event ARITH.DIV_ACTIVE", "CounterMask": "1", @@ -505,7 +489,7 @@ "UMask": "0x1" }, { - "BriefDescription": "INT_MISC.UNKNOWN_BRANCH_CYCLES", + "BriefDescription": "Bubble cycles of BAClear (Unknown Branch).", "EventCode": "0xad", "EventName": "INT_MISC.UNKNOWN_BRANCH_CYCLES", "MSRIndex": "0x3F7", diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-interconnect.json index 09d840c7da4c9..65d088556bae8 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-interconnect.json @@ -4825,11 +4825,11 @@ "Unit": "M3UPI" }, { - "BriefDescription": "Number of allocations into the CRS Egress used to queue up requests destined to the mesh (AD Bouncable)", + "BriefDescription": "Number of allocations into the CRS Egress used to queue up requests destined to the mesh (AD Bounceable)", "EventCode": "0x47", "EventName": "UNC_MDF_CRS_TxR_INSERTS.AD_BNC", "PerPkg": "1", - "PublicDescription": "AD Bouncable : Number of allocations into the CRS Egress", + "PublicDescription": "AD Bounceable : Number of allocations into the CRS Egress", "UMask": "0x1", "Unit": "MDF" }, @@ -4861,11 +4861,11 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of allocations into the CRS Egress used to queue up requests destined to the mesh (BL Bouncable)", + "BriefDescription": "Number of allocations into the CRS Egress used to queue up requests destined to the mesh (BL Bounceable)", "EventCode": "0x47", "EventName": "UNC_MDF_CRS_TxR_INSERTS.BL_BNC", "PerPkg": "1", - "PublicDescription": "BL Bouncable : Number of allocations into the CRS Egress", + "PublicDescription": "BL Bounceable : Number of allocations into the CRS Egress", "UMask": "0x4", "Unit": "MDF" }, diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-io.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-io.json index 557080b74ee50..0761980c34a04 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-io.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-io.json @@ -1185,6 +1185,36 @@ "UMask": "0x70ff010", "Unit": "IIO" }, + { + "BriefDescription": ": IOTLB Hits to a 1G Page", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.1G_HITS", + "PerPkg": "1", + "PortMask": "0x0000", + "PublicDescription": ": IOTLB Hits to a 1G Page : Counts if a transaction to a 1G page, on its first lookup, hits the IOTLB.", + "UMask": "0x10", + "Unit": "IIO" + }, + { + "BriefDescription": ": IOTLB Hits to a 2M Page", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.2M_HITS", + "PerPkg": "1", + "PortMask": "0x0000", + "PublicDescription": ": IOTLB Hits to a 2M Page : Counts if a transaction to a 2M page, on its first lookup, hits the IOTLB.", + "UMask": "0x8", + "Unit": "IIO" + }, + { + "BriefDescription": ": IOTLB Hits to a 4K Page", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.4K_HITS", + "PerPkg": "1", + "PortMask": "0x0000", + "PublicDescription": ": IOTLB Hits to a 4K Page : Counts if a transaction to a 4K page, on its first lookup, hits the IOTLB.", + "UMask": "0x4", + "Unit": "IIO" + }, { "BriefDescription": ": Context cache hits", "EventCode": "0x40", diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index e571683f59f3d..fd38c516c0480 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -7,7 +7,7 @@ GenuineIntel-6-56,v11,broadwellde,core GenuineIntel-6-4F,v22,broadwellx,core GenuineIntel-6-55-[56789ABCDEF],v1.20,cascadelakex,core GenuineIntel-6-9[6C],v1.04,elkhartlake,core -GenuineIntel-6-CF,v1.01,emeraldrapids,core +GenuineIntel-6-CF,v1.02,emeraldrapids,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core GenuineIntel-6-B6,v1.00,grandridge,core -- GitLab From 8550506887a93cd6ff4f6b44a08e8c2cc7a4d481 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 3 Jan 2024 23:42:58 -0800 Subject: [PATCH 0659/1290] perf vendor events intel: Update icelakex events to v1.23 Update to v1.23 released in: https://github.com/intel/perfmon/pull/123 Updates to event descriptions. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240104074259.653219-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/icelakex/other.json | 2 +- tools/perf/pmu-events/arch/x86/icelakex/pipeline.json | 2 +- .../pmu-events/arch/x86/icelakex/uncore-interconnect.json | 6 +++--- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/icelakex/other.json b/tools/perf/pmu-events/arch/x86/icelakex/other.json index 63d5faf2fc43e..11810daaf1503 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/other.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/other.json @@ -19,7 +19,7 @@ "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX512 turbo schedule.", "EventCode": "0x28", "EventName": "CORE_POWER.LVL2_TURBO_LICENSE", - "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchtecture). This includes high current AVX 512-bit instructions.", + "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchitecture). This includes high current AVX 512-bit instructions.", "SampleAfterValue": "200003", "UMask": "0x20" }, diff --git a/tools/perf/pmu-events/arch/x86/icelakex/pipeline.json b/tools/perf/pmu-events/arch/x86/icelakex/pipeline.json index 176e5ef2a24af..45ee6bceba7f1 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/pipeline.json @@ -519,7 +519,7 @@ "BriefDescription": "Cycles when Reservation Station (RS) is empty for the thread", "EventCode": "0x5e", "EventName": "RS_EVENTS.EMPTY_CYCLES", - "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into stravation periods (e.g. branch mispredictions or i-cache misses)", + "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)", "SampleAfterValue": "1000003", "UMask": "0x1" }, diff --git a/tools/perf/pmu-events/arch/x86/icelakex/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/icelakex/uncore-interconnect.json index f87ea3f66d1be..a066a009c5117 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/uncore-interconnect.json @@ -38,7 +38,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.CLFLUSH", "PerPkg": "1", - "PublicDescription": "Coherent Ops : CLFlush : Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Coherent Ops : CLFlush : Counts the number of coherency related operations serviced by the IRP", "UMask": "0x80", "Unit": "IRP" }, @@ -65,7 +65,7 @@ "EventCode": "0x10", "EventName": "UNC_I_COHERENT_OPS.WBMTOI", "PerPkg": "1", - "PublicDescription": "Coherent Ops : WbMtoI : Counts the number of coherency related operations servied by the IRP", + "PublicDescription": "Coherent Ops : WbMtoI : Counts the number of coherency related operations serviced by the IRP", "UMask": "0x40", "Unit": "IRP" }, @@ -454,7 +454,7 @@ "EventCode": "0x11", "EventName": "UNC_I_TRANSACTIONS.WRITES", "PerPkg": "1", - "PublicDescription": "Inbound Transaction Count : Writes : Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID. : Trackes only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. For writes that are tickled and have to retry, the counter will be incremented for each retry.", + "PublicDescription": "Inbound Transaction Count : Writes : Counts the number of Inbound transactions from the IRP to the Uncore. This can be filtered based on request type in addition to the source queue. Note the special filtering equation. We do OR-reduction on the request type. If the SOURCE bit is set, then we also do AND qualification based on the source portID. : Tracks only write requests. Each write request should have a prefetch, so there is no need to explicitly track these requests. For writes that are tickled and have to retry, the counter will be incremented for each retry.", "UMask": "0x2", "Unit": "IRP" }, diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index fd38c516c0480..c1820eb16a19a 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -15,7 +15,7 @@ GenuineIntel-6-A[DE],v1.01,graniterapids,core GenuineIntel-6-(3C|45|46),v33,haswell,core GenuineIntel-6-3F,v28,haswellx,core GenuineIntel-6-7[DE],v1.19,icelake,core -GenuineIntel-6-6[AC],v1.21,icelakex,core +GenuineIntel-6-6[AC],v1.23,icelakex,core GenuineIntel-6-3A,v24,ivybridge,core GenuineIntel-6-3E,v24,ivytown,core GenuineIntel-6-2D,v24,jaketown,core -- GitLab From 360b045fceb282fb21b66131c396b0f85759ddb7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 3 Jan 2024 23:42:59 -0800 Subject: [PATCH 0660/1290] perf vendor events intel: Update sapphirerapids events to v1.17 Update to v1.17 released in: https://github.com/intel/perfmon/pull/123 Add events FP_ARITH_DISPATCHED.V0, FP_ARITH_DISPATCHED.V1, FP_ARITH_DISPATCHED.V2, UNC_IIO_IOMMU0.1G_HITS, UNC_IIO_IOMMU0.2M_HITS and UNC_IIO_IOMMU0.4K_HITS. Description updates. Signed-off-by: Ian Rogers Reviewed-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Edward Baker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20240104074259.653219-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- .../x86/sapphirerapids/floating-point.json | 27 +++++++++++++++-- .../arch/x86/sapphirerapids/pipeline.json | 2 +- .../sapphirerapids/uncore-interconnect.json | 8 ++--- .../arch/x86/sapphirerapids/uncore-io.json | 30 +++++++++++++++++++ 5 files changed, 60 insertions(+), 9 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index c1820eb16a19a..4d1deed4437ab 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -26,7 +26,7 @@ GenuineIntel-6-1[AEF],v4,nehalemep,core GenuineIntel-6-2E,v4,nehalemex,core GenuineIntel-6-A7,v1.01,rocketlake,core GenuineIntel-6-2A,v19,sandybridge,core -GenuineIntel-6-8F,v1.16,sapphirerapids,core +GenuineIntel-6-8F,v1.17,sapphirerapids,core GenuineIntel-6-AF,v1.00,sierraforest,core GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v57,skylake,core diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/floating-point.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/floating-point.json index 4a9d211e9d4f1..1bdefaf962877 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/floating-point.json @@ -23,26 +23,47 @@ "UMask": "0x10" }, { - "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0", + "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0]", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_0", "SampleAfterValue": "2000003", "UMask": "0x1" }, { - "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1", + "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1]", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_1", "SampleAfterValue": "2000003", "UMask": "0x2" }, { - "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5", + "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2]", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_5", "SampleAfterValue": "2000003", "UMask": "0x4" }, + { + "BriefDescription": "FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0]", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V0", + "SampleAfterValue": "2000003", + "UMask": "0x1" + }, + { + "BriefDescription": "FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1]", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V1", + "SampleAfterValue": "2000003", + "UMask": "0x2" + }, + { + "BriefDescription": "FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5]", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V2", + "SampleAfterValue": "2000003", + "UMask": "0x4" + }, { "BriefDescription": "Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", "EventCode": "0xc7", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json index 6dcf3b763af4f..2cfe814d20151 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json @@ -505,7 +505,7 @@ "UMask": "0x1" }, { - "BriefDescription": "INT_MISC.UNKNOWN_BRANCH_CYCLES", + "BriefDescription": "Bubble cycles of BAClear (Unknown Branch).", "EventCode": "0xad", "EventName": "INT_MISC.UNKNOWN_BRANCH_CYCLES", "MSRIndex": "0x3F7", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json index 09d840c7da4c9..65d088556bae8 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-interconnect.json @@ -4825,11 +4825,11 @@ "Unit": "M3UPI" }, { - "BriefDescription": "Number of allocations into the CRS Egress used to queue up requests destined to the mesh (AD Bouncable)", + "BriefDescription": "Number of allocations into the CRS Egress used to queue up requests destined to the mesh (AD Bounceable)", "EventCode": "0x47", "EventName": "UNC_MDF_CRS_TxR_INSERTS.AD_BNC", "PerPkg": "1", - "PublicDescription": "AD Bouncable : Number of allocations into the CRS Egress", + "PublicDescription": "AD Bounceable : Number of allocations into the CRS Egress", "UMask": "0x1", "Unit": "MDF" }, @@ -4861,11 +4861,11 @@ "Unit": "MDF" }, { - "BriefDescription": "Number of allocations into the CRS Egress used to queue up requests destined to the mesh (BL Bouncable)", + "BriefDescription": "Number of allocations into the CRS Egress used to queue up requests destined to the mesh (BL Bounceable)", "EventCode": "0x47", "EventName": "UNC_MDF_CRS_TxR_INSERTS.BL_BNC", "PerPkg": "1", - "PublicDescription": "BL Bouncable : Number of allocations into the CRS Egress", + "PublicDescription": "BL Bounceable : Number of allocations into the CRS Egress", "UMask": "0x4", "Unit": "MDF" }, diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-io.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-io.json index 8b5f54fed1033..03596db877101 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-io.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-io.json @@ -1249,6 +1249,36 @@ "UMask": "0x70ff010", "Unit": "IIO" }, + { + "BriefDescription": ": IOTLB Hits to a 1G Page", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.1G_HITS", + "PerPkg": "1", + "PortMask": "0x0000", + "PublicDescription": ": IOTLB Hits to a 1G Page : Counts if a transaction to a 1G page, on its first lookup, hits the IOTLB.", + "UMask": "0x10", + "Unit": "IIO" + }, + { + "BriefDescription": ": IOTLB Hits to a 2M Page", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.2M_HITS", + "PerPkg": "1", + "PortMask": "0x0000", + "PublicDescription": ": IOTLB Hits to a 2M Page : Counts if a transaction to a 2M page, on its first lookup, hits the IOTLB.", + "UMask": "0x8", + "Unit": "IIO" + }, + { + "BriefDescription": ": IOTLB Hits to a 4K Page", + "EventCode": "0x40", + "EventName": "UNC_IIO_IOMMU0.4K_HITS", + "PerPkg": "1", + "PortMask": "0x0000", + "PublicDescription": ": IOTLB Hits to a 4K Page : Counts if a transaction to a 4K page, on its first lookup, hits the IOTLB.", + "UMask": "0x4", + "Unit": "IIO" + }, { "BriefDescription": ": Context cache hits", "EventCode": "0x40", -- GitLab From 6af6d22495efeb00cb4e220a4e6e30b5be3afdf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ahelenia=20Ziemia=C5=84ska?= Date: Tue, 27 Dec 2022 21:57:40 +0100 Subject: [PATCH 0661/1290] perf TUI: Don't ignore job control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In its infinite wisdom, by default, SLang sets susp undef, and this can only be un-done by calling SLtty_set_suspend_state(true). After every SLang_init_tty(). Additionally, no provisions are made for maintaining the teletype attributes across suspend/continue (outside of curses emulation mode(?!), which provides full support, naturally), so we need to save and restore the flags ourselves, as well as reset the text colours when going under. We need to also re-draw the screen, and raising SIGWINCH, shockingly, Just Works. The correct solution would be to Not Use SLang, but as a stop-gap, this makes TUI 'perf report' usable. Signed-off-by: Ahelenia Ziemiańska Tested-by: Arnaldo Carvalho de Melo Tested-by: Namhyung Kim Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Cc: yaowenbin Link: https://lore.kernel.org/r/0354dcae23a8713f75f4fed609e0caec3c6e3cd5.1672174189.git.nabijaczleweli@nabijaczleweli.xyz Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 1 + tools/perf/ui/browsers/hists.c | 2 ++ tools/perf/ui/browsers/scripts.c | 1 + tools/perf/ui/tui/setup.c | 22 ++++++++++++++++++++++ 4 files changed, 26 insertions(+) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index cb2eb6dcb5321..ec5e219328760 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -938,6 +938,7 @@ int hist_entry__tui_annotate(struct hist_entry *he, struct evsel *evsel, /* reset abort key so that it can get Ctrl-C as a key */ SLang_reset_tty(); SLang_init_tty(0, 0, 0); + SLtty_set_suspend_state(true); return map_symbol__tui_annotate(&he->ms, evsel, hbt); } diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 3061dea29e6b6..0c02b3a8e121f 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -3000,6 +3000,7 @@ static int evsel__hists_browse(struct evsel *evsel, int nr_events, const char *h /* reset abort key so that it can get Ctrl-C as a key */ SLang_reset_tty(); SLang_init_tty(0, 0, 0); + SLtty_set_suspend_state(true); if (min_pcnt) browser->min_pcnt = min_pcnt; @@ -3667,6 +3668,7 @@ int block_hists_tui_browse(struct block_hist *bh, struct evsel *evsel, /* reset abort key so that it can get Ctrl-C as a key */ SLang_reset_tty(); SLang_init_tty(0, 0, 0); + SLtty_set_suspend_state(true); memset(&action, 0, sizeof(action)); diff --git a/tools/perf/ui/browsers/scripts.c b/tools/perf/ui/browsers/scripts.c index 47d2c7a8cbe13..50d45054ed6c1 100644 --- a/tools/perf/ui/browsers/scripts.c +++ b/tools/perf/ui/browsers/scripts.c @@ -166,6 +166,7 @@ void run_script(char *cmd) printf("\033[c\033[H\033[J"); fflush(stdout); SLang_init_tty(0, 0, 0); + SLtty_set_suspend_state(true); SLsmg_refresh(); } diff --git a/tools/perf/ui/tui/setup.c b/tools/perf/ui/tui/setup.c index 605d9e175ea73..16c6eff4d2411 100644 --- a/tools/perf/ui/tui/setup.c +++ b/tools/perf/ui/tui/setup.c @@ -2,12 +2,14 @@ #include #include #include +#include #include #include #ifdef HAVE_BACKTRACE_SUPPORT #include #endif +#include "../../util/color.h" #include "../../util/debug.h" #include "../browser.h" #include "../helpline.h" @@ -121,6 +123,23 @@ static void ui__signal(int sig) exit(0); } +static void ui__sigcont(int sig) +{ + static struct termios tty; + + if (sig == SIGTSTP) { + while (tcgetattr(SLang_TT_Read_FD, &tty) == -1 && errno == EINTR) + ; + while (write(SLang_TT_Read_FD, PERF_COLOR_RESET, sizeof(PERF_COLOR_RESET) - 1) == -1 && errno == EINTR) + ; + raise(SIGSTOP); + } else { + while (tcsetattr(SLang_TT_Read_FD, TCSADRAIN, &tty) == -1 && errno == EINTR) + ; + raise(SIGWINCH); + } +} + int ui__init(void) { int err; @@ -135,6 +154,7 @@ int ui__init(void) err = SLang_init_tty(-1, 0, 0); if (err < 0) goto out; + SLtty_set_suspend_state(true); err = SLkp_init(); if (err < 0) { @@ -149,6 +169,8 @@ int ui__init(void) signal(SIGINT, ui__signal); signal(SIGQUIT, ui__signal); signal(SIGTERM, ui__signal); + signal(SIGTSTP, ui__sigcont); + signal(SIGCONT, ui__sigcont); perf_error__register(&perf_tui_eops); -- GitLab From ad30469a841b50dbb541df4d6971d891f703c297 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 7 Dec 2023 16:05:13 -0800 Subject: [PATCH 0662/1290] libsubcmd: Fix memory leak in uniq() uniq() will write one command name over another causing the overwritten string to be leaked. Fix by doing a pass that removes duplicates and a second that removes the holes. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Chenyuan Mi Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231208000515.1693746-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/help.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c index adfbae27dc369..8561b0f01a247 100644 --- a/tools/lib/subcmd/help.c +++ b/tools/lib/subcmd/help.c @@ -52,11 +52,21 @@ void uniq(struct cmdnames *cmds) if (!cmds->cnt) return; - for (i = j = 1; i < cmds->cnt; i++) - if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name)) - cmds->names[j++] = cmds->names[i]; - + for (i = 1; i < cmds->cnt; i++) { + if (!strcmp(cmds->names[i]->name, cmds->names[i-1]->name)) + zfree(&cmds->names[i - 1]); + } + for (i = 0, j = 0; i < cmds->cnt; i++) { + if (cmds->names[i]) { + if (i == j) + j++; + else + cmds->names[j++] = cmds->names[i]; + } + } cmds->cnt = j; + while (j < i) + cmds->names[j++] = NULL; } void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) -- GitLab From bb177a85e82b37d3b76e65f3f773e8502be49d9b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 7 Dec 2023 09:40:57 -0800 Subject: [PATCH 0663/1290] perf tests: Add perf script test Start a new set of shell tests for testing perf script. The initial contribution is checking that some perf db-export functionality works as reported in this regression by Ben Gainey : https://lore.kernel.org/lkml/20231207140911.3240408-1-ben.gainey@arm.com/ Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Tested-by: Ben Gainey Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231207174057.1482161-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/script.sh | 66 ++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100755 tools/perf/tests/shell/script.sh diff --git a/tools/perf/tests/shell/script.sh b/tools/perf/tests/shell/script.sh new file mode 100755 index 0000000000000..5ae7bd0031a82 --- /dev/null +++ b/tools/perf/tests/shell/script.sh @@ -0,0 +1,66 @@ +#!/bin/sh +# perf script tests +# SPDX-License-Identifier: GPL-2.0 + +set -e + +temp_dir=$(mktemp -d /tmp/perf-test-script.XXXXXXXXXX) + +perfdatafile="${temp_dir}/perf.data" +db_test="${temp_dir}/db_test.py" + +err=0 + +cleanup() +{ + trap - EXIT TERM INT + sane=$(echo "${temp_dir}" | cut -b 1-21) + if [ "${sane}" = "/tmp/perf-test-script" ] ; then + echo "--- Cleaning up ---" + rm -f "${temp_dir}/"* + rmdir "${temp_dir}" + fi +} + +trap_cleanup() +{ + cleanup + exit 1 +} + +trap trap_cleanup EXIT TERM INT + + +test_db() +{ + echo "DB test" + + # Check if python script is supported + libpython=$(perf version --build-options | grep python | grep -cv OFF) + if [ "${libpython}" != "1" ] ; then + echo "SKIP: python scripting is not supported" + err=2 + return + fi + + cat << "_end_of_file_" > "${db_test}" +perf_db_export_mode = True +perf_db_export_calls = False +perf_db_export_callchains = True + +def sample_table(*args): + print(f'sample_table({args})') + +def call_path_table(*args): + print(f'call_path_table({args}') +_end_of_file_ + perf record -g -o "${perfdatafile}" true + perf script -i "${perfdatafile}" -s "${db_test}" + echo "DB test [Success]" +} + +test_db + +cleanup + +exit $err -- GitLab From 1e24ce402c97dc3c0ab050593f1d5f6fde524564 Mon Sep 17 00:00:00 2001 From: Ben Gainey Date: Thu, 7 Dec 2023 14:09:11 +0000 Subject: [PATCH 0664/1290] perf db-export: Fix missing reference count get in call_path_from_sample() The addr_location map and maps fields in the inner loop were missing calls to map__get()/maps__get(). The subsequent addr_location__exit() call in each loop puts the map/maps fields causing use-after-free aborts. This issue reproduces on at least arm64 and x86_64 with something simple like `perf record -g ls` followed by `perf script -s script.py` with the following script: perf_db_export_mode = True perf_db_export_calls = False perf_db_export_callchains = True def sample_table(*args): print(f'sample_table({args})') def call_path_table(*args): print(f'call_path_table({args}') Committer testing: This test, just introduced by Ian Rogers, now passes, not segfaulting anymore: # perf test "perf script tests" 95: perf script tests : Ok # Fixes: 0dd5041c9a0eaf8c ("perf addr_location: Add init/exit/copy functions") Signed-off-by: Ben Gainey Tested-by: Arnaldo Carvalho de Melo Tested-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20231207140911.3240408-1-ben.gainey@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/db-export.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index b9fb71ab7a730..106429155c2e9 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -253,8 +253,8 @@ static struct call_path *call_path_from_sample(struct db_export *dbe, */ addr_location__init(&al); al.sym = node->ms.sym; - al.map = node->ms.map; - al.maps = thread__maps(thread); + al.map = map__get(node->ms.map); + al.maps = maps__get(thread__maps(thread)); al.addr = node->ip; if (al.map && !al.sym) -- GitLab From b6d8b858dbbbd832d255c3c8a3721173e6edf036 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Tue, 19 Dec 2023 15:32:35 +0100 Subject: [PATCH 0665/1290] perf test: test case 'Setup struct perf_event_attr' fails on s390 on z/vm perf test 17 'Setup struct perf_event_attr' fails on s390 z/VM guest, using linux-next kernel. Root cause is the fall-back from hardware counter cycles perf_event_attr: type 0 (PERF_TYPE_HARDWARE) size 136 config 0 (PERF_COUNT_HW_CPU_CYCLES) { sample_period, sample_freq } 4000 sample_type IP|TID|TIME|ADDR|PERIOD|DATA_SRC read_format ID|LOST which returns -ENOENT on s390 z/VM guest. This causes the code to fall back to software counter task-clock, as can be seen in the debug output: ------------------------------------------------------------ perf_event_attr: type 1 (PERF_TYPE_SOFTWARE) size 136 config 0x1 (PERF_COUNT_SW_TASK_CLOCK) <-here { sample_period, sample_freq } 4000 sample_type IP|TID|TIME|ADDR|PERIOD|DATA_SRC read_format ID|LOST This succeeds on s390 z/VM guest. This successful installation of the counter task-clock is not listed in the expected results and the test case fails. This is caused by commit eb2eac0c7b618033 ("perf evsel: Fallback to "task-clock" when not system wide") which introduced fall back from event 'cycles' to event 'task-clock'. To fix this on s390 allow event number 0 (cycles) and event number 1 (task-clock) as expected result. Output before: # ./perf test -Fv 17 17: Setup struct perf_event_attr : --- start --- running './tests/attr/test-stat-group1' unsupp './tests/attr/test-stat-group1' running './tests/attr/test-record-graph-default' test limitation '!aarch64' excluded architecture list ['aarch64'] expected config=0, got 1 FAILED './tests/attr/test-record-graph-default' - match failure ---- end ---- Setup struct perf_event_attr: FAILED! # Output after: # ./perf test -F 17 17: Setup struct perf_event_attr : Ok # Fixes: eb2eac0c7b618033 ("perf evsel: Fallback to "task-clock" when not system wide") Signed-off-by: Thomas Richter Acked-by: Ian Rogers Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Link: https://lore.kernel.org/r/20231219143235.1075522-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/attr/base-record | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record index 27c21271a16c9..b44e4e6e44438 100644 --- a/tools/perf/tests/attr/base-record +++ b/tools/perf/tests/attr/base-record @@ -6,7 +6,7 @@ flags=0|8 cpu=* type=0|1 size=136 -config=0 +config=0|1 sample_period=* sample_type=263 read_format=0|4|20 -- GitLab From d6488fee66472b468ed88d265b14aa3f04dc3bdf Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 8 Dec 2023 11:06:36 +0800 Subject: [PATCH 0666/1290] cxl/port: Fix decoder initialization when nr_targets > interleave_ways The decoder_populate_targets() helper walks all of the targets in a port and makes sure they can be looked up in @target_map. Where @target_map is a lookup table from target position to target id (corresponding to a cxl_dport instance). However @target_map is only responsible for conveying the active dport instances as indicated by interleave_ways. When nr_targets > interleave_ways it results in decoder_populate_targets() walking off the end of the valid entries in @target_map. Given target_map is initialized to 0 it results in the dport lookup failing if position 0 is not mapped to a dport with an id of 0: cxl_port port3: Failed to populate active decoder targets cxl_port port3: Failed to add decoder cxl_port port3: Failed to add decoder3.0 cxl_bus_probe: cxl_port port3: probe: -6 This bug also highlights that when the decoder's ->targets[] array is written in cxl_port_setup_targets() it is missing a hold of the targets_lock to synchronize against sysfs readers of the target list. A fix for that is saved for a later patch. Fixes: a5c258021689 ("cxl/bus: Populate the target list at decoder create") Cc: Signed-off-by: Huang, Ying [djbw: rewrite the changelog, find the Fixes: tag] Co-developed-by: Dan Williams Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index b7c93bb18f6e7..57495cdc181fd 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1644,7 +1644,7 @@ static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd, return -EINVAL; write_seqlock(&cxlsd->target_lock); - for (i = 0; i < cxlsd->nr_targets; i++) { + for (i = 0; i < cxlsd->cxld.interleave_ways; i++) { struct cxl_dport *dport = find_dport(port, target_map[i]); if (!dport) { -- GitLab From 5459e186a5c9f412334321cff58d70dcb0e48a04 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 21 Dec 2023 21:05:01 -0800 Subject: [PATCH 0667/1290] cxl/port: Fix missing target list lock cxl_port_setup_targets() modifies the ->targets[] array of a switch decoder. target_list_show() expects to be able to emit a coherent snapshot of that array by "holding" ->target_lock for read. The target_lock is held for write during initialization of the ->targets[] array, but it is not held for write during cxl_port_setup_targets(). The ->target_lock() predates the introduction of @cxl_region_rwsem. That semaphore protects changes to host-physical-address (HPA) decode which is precisely what writes to a switch decoder's target list affects. Replace ->target_lock with @cxl_region_rwsem. Now the side-effect of snapshotting a unstable view of a decoder's target list is likely benign so the Fixes: tag is presumptive. Fixes: 27b3f8d13830 ("cxl/region: Program target lists") Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 22 +++++++--------------- drivers/cxl/cxl.h | 2 -- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 57495cdc181fd..6b386771cc259 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -172,14 +172,10 @@ static ssize_t target_list_show(struct device *dev, { struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev); ssize_t offset; - unsigned int seq; int rc; - do { - seq = read_seqbegin(&cxlsd->target_lock); - rc = emit_target_list(cxlsd, buf); - } while (read_seqretry(&cxlsd->target_lock, seq)); - + guard(rwsem_read)(&cxl_region_rwsem); + rc = emit_target_list(cxlsd, buf); if (rc < 0) return rc; offset = rc; @@ -1633,7 +1629,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_mem_find_port, CXL); static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd, struct cxl_port *port, int *target_map) { - int i, rc = 0; + int i; if (!target_map) return 0; @@ -1643,19 +1639,16 @@ static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd, if (xa_empty(&port->dports)) return -EINVAL; - write_seqlock(&cxlsd->target_lock); + guard(rwsem_write)(&cxl_region_rwsem); for (i = 0; i < cxlsd->cxld.interleave_ways; i++) { struct cxl_dport *dport = find_dport(port, target_map[i]); - if (!dport) { - rc = -ENXIO; - break; - } + if (!dport) + return -ENXIO; cxlsd->target[i] = dport; } - write_sequnlock(&cxlsd->target_lock); - return rc; + return 0; } struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos) @@ -1725,7 +1718,6 @@ static int cxl_switch_decoder_init(struct cxl_port *port, return -EINVAL; cxlsd->nr_targets = nr_targets; - seqlock_init(&cxlsd->target_lock); return cxl_decoder_init(port, &cxlsd->cxld); } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 687043ece1018..62fa96f8567e5 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -412,7 +412,6 @@ struct cxl_endpoint_decoder { /** * struct cxl_switch_decoder - Switch specific CXL HDM Decoder * @cxld: base cxl_decoder object - * @target_lock: coordinate coherent reads of the target list * @nr_targets: number of elements in @target * @target: active ordered target list in current decoder configuration * @@ -424,7 +423,6 @@ struct cxl_endpoint_decoder { */ struct cxl_switch_decoder { struct cxl_decoder cxld; - seqlock_t target_lock; int nr_targets; struct cxl_dport *target[]; }; -- GitLab From e109deadb73318cf4a3bd61287d969f705df278f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 4 Jan 2024 16:57:12 -0500 Subject: [PATCH 0668/1290] eventfs: Have eventfs_iterate() stop immediately if ei->is_freed is set If ei->is_freed is set in eventfs_iterate(), it means that the directory that is being iterated on is in the process of being freed. Just exit the loop immediately when that is ever detected, and separate out the return of the entry->callback() from ei->is_freed. Link: https://lore.kernel.org/linux-trace-kernel/20240104220048.016261289@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Linus Torvalds Cc: Al Viro Cc: Christian Brauner Cc: Greg Kroah-Hartman Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 72912b5f9a90b..0aca6910efb3f 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -788,11 +788,12 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) name = entry->name; mutex_lock(&eventfs_mutex); - /* If ei->is_freed, then the event itself may be too */ - if (!ei->is_freed) - r = entry->callback(name, &mode, &cdata, &fops); - else - r = -1; + /* If ei->is_freed then just bail here, nothing more to do */ + if (ei->is_freed) { + mutex_unlock(&eventfs_mutex); + goto out; + } + r = entry->callback(name, &mode, &cdata, &fops); mutex_unlock(&eventfs_mutex); if (r <= 0) continue; -- GitLab From 1e4624eb5a0ecaae0d2c4e3019bece119725bb98 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 4 Jan 2024 16:57:13 -0500 Subject: [PATCH 0669/1290] eventfs: Do ctx->pos update for all iterations in eventfs_iterate() The ctx->pos was only updated when it added an entry, but the "skip to current pos" check (c--) happened for every loop regardless of if the entry was added or not. This inconsistency caused readdir to be incorrect. It was due to: for (i = 0; i < ei->nr_entries; i++) { if (c > 0) { c--; continue; } mutex_lock(&eventfs_mutex); /* If ei->is_freed then just bail here, nothing more to do */ if (ei->is_freed) { mutex_unlock(&eventfs_mutex); goto out; } r = entry->callback(name, &mode, &cdata, &fops); mutex_unlock(&eventfs_mutex); [..] ctx->pos++; } But this can cause the iterator to return a file that was already read. That's because of the way the callback() works. Some events may not have all files, and the callback can return 0 to tell eventfs to skip the file for this directory. for instance, we have: # ls /sys/kernel/tracing/events/ftrace/function format hist hist_debug id inject and # ls /sys/kernel/tracing/events/sched/sched_switch/ enable filter format hist hist_debug id inject trigger Where the function directory is missing "enable", "filter" and "trigger". That's because the callback() for events has: static int event_callback(const char *name, umode_t *mode, void **data, const struct file_operations **fops) { struct trace_event_file *file = *data; struct trace_event_call *call = file->event_call; [..] /* * Only event directories that can be enabled should have * triggers or filters, with the exception of the "print" * event that can have a "trigger" file. */ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { if (call->class->reg && strcmp(name, "enable") == 0) { *mode = TRACE_MODE_WRITE; *fops = &ftrace_enable_fops; return 1; } if (strcmp(name, "filter") == 0) { *mode = TRACE_MODE_WRITE; *fops = &ftrace_event_filter_fops; return 1; } } if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || strcmp(trace_event_name(call), "print") == 0) { if (strcmp(name, "trigger") == 0) { *mode = TRACE_MODE_WRITE; *fops = &event_trigger_fops; return 1; } } [..] return 0; } Where the function event has the TRACE_EVENT_FL_IGNORE_ENABLE set. This means that the entries array elements for "enable", "filter" and "trigger" when called on the function event will have the callback return 0 and not 1, to tell eventfs to skip these files for it. Because the "skip to current ctx->pos" check happened for all entries, but the ctx->pos++ only happened to entries that exist, it would confuse the reading of a directory. Which would cause: # ls /sys/kernel/tracing/events/ftrace/function/ format hist hist hist_debug hist_debug id inject inject The missing "enable", "filter" and "trigger" caused ls to show "hist", "hist_debug" and "inject" twice. Update the ctx->pos for every iteration to keep its update and the "skip" update consistent. This also means that on error, the ctx->pos needs to be decremented if it was incremented without adding something. Link: https://lore.kernel.org/all/20240104150500.38b15a62@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20240104220048.172295263@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Linus Torvalds Cc: Al Viro Cc: Christian Brauner Cc: Greg Kroah-Hartman Fixes: 493ec81a8fb8e ("eventfs: Stop using dcache_readdir() for getdents()") Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 0aca6910efb3f..c73fb1f7ddbc2 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -760,6 +760,8 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) continue; } + ctx->pos++; + if (ei_child->is_freed) continue; @@ -767,13 +769,12 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) dentry = create_dir_dentry(ei, ei_child, ei_dentry); if (!dentry) - goto out; + goto out_dec; ino = dentry->d_inode->i_ino; dput(dentry); if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR)) - goto out; - ctx->pos++; + goto out_dec; } for (i = 0; i < ei->nr_entries; i++) { @@ -784,6 +785,8 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) continue; } + ctx->pos++; + entry = &ei->entries[i]; name = entry->name; @@ -791,7 +794,7 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) /* If ei->is_freed then just bail here, nothing more to do */ if (ei->is_freed) { mutex_unlock(&eventfs_mutex); - goto out; + goto out_dec; } r = entry->callback(name, &mode, &cdata, &fops); mutex_unlock(&eventfs_mutex); @@ -800,19 +803,23 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) dentry = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops); if (!dentry) - goto out; + goto out_dec; ino = dentry->d_inode->i_ino; dput(dentry); if (!dir_emit(ctx, name, strlen(name), ino, DT_REG)) - goto out; - ctx->pos++; + goto out_dec; } ret = 1; out: srcu_read_unlock(&eventfs_srcu, idx); return ret; + + out_dec: + /* Incremented ctx->pos without adding something, reset it */ + ctx->pos--; + goto out; } /** -- GitLab From 704f960dbee2f1634f4b4e16f208cb16eaf41c1e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 4 Jan 2024 16:57:14 -0500 Subject: [PATCH 0670/1290] eventfs: Read ei->entries before ei->children in eventfs_iterate() In order to apply a shortcut to skip over the current ctx->pos immediately, by using the ei->entries array, the reading of that array should be first. Moving the array reading before the linked list reading will make the shortcut change diff nicer to read. Link: https://lore.kernel.org/all/CAHk-=wiKwDUDv3+jCsv-uacDcHDVTYsXtBR9=6sGM5mqX+DhOg@mail.gmail.com/ Link: https://lore.kernel.org/linux-trace-kernel/20240104220048.333115095@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Linus Torvalds Cc: Al Viro Cc: Christian Brauner Cc: Greg Kroah-Hartman Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 46 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index c73fb1f7ddbc2..a1934e0eea3b7 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -752,8 +752,8 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) * Need to create the dentries and inodes to have a consistent * inode number. */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { + for (i = 0; i < ei->nr_entries; i++) { + void *cdata = ei->data; if (c > 0) { c--; @@ -762,23 +762,32 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) ctx->pos++; - if (ei_child->is_freed) - continue; + entry = &ei->entries[i]; + name = entry->name; - name = ei_child->name; + mutex_lock(&eventfs_mutex); + /* If ei->is_freed then just bail here, nothing more to do */ + if (ei->is_freed) { + mutex_unlock(&eventfs_mutex); + goto out_dec; + } + r = entry->callback(name, &mode, &cdata, &fops); + mutex_unlock(&eventfs_mutex); + if (r <= 0) + continue; - dentry = create_dir_dentry(ei, ei_child, ei_dentry); + dentry = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops); if (!dentry) goto out_dec; ino = dentry->d_inode->i_ino; dput(dentry); - if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR)) + if (!dir_emit(ctx, name, strlen(name), ino, DT_REG)) goto out_dec; } - for (i = 0; i < ei->nr_entries; i++) { - void *cdata = ei->data; + list_for_each_entry_srcu(ei_child, &ei->children, list, + srcu_read_lock_held(&eventfs_srcu)) { if (c > 0) { c--; @@ -787,27 +796,18 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) ctx->pos++; - entry = &ei->entries[i]; - name = entry->name; - - mutex_lock(&eventfs_mutex); - /* If ei->is_freed then just bail here, nothing more to do */ - if (ei->is_freed) { - mutex_unlock(&eventfs_mutex); - goto out_dec; - } - r = entry->callback(name, &mode, &cdata, &fops); - mutex_unlock(&eventfs_mutex); - if (r <= 0) + if (ei_child->is_freed) continue; - dentry = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops); + name = ei_child->name; + + dentry = create_dir_dentry(ei, ei_child, ei_dentry); if (!dentry) goto out_dec; ino = dentry->d_inode->i_ino; dput(dentry); - if (!dir_emit(ctx, name, strlen(name), ino, DT_REG)) + if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR)) goto out_dec; } ret = 1; -- GitLab From 1de94b52d5e8d8b32f0252f14fad1f1edc2e71f1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 4 Jan 2024 16:57:15 -0500 Subject: [PATCH 0671/1290] eventfs: Shortcut eventfs_iterate() by skipping entries already read As the ei->entries array is fixed for the duration of the eventfs_inode, it can be used to skip over already read entries in eventfs_iterate(). That is, if ctx->pos is greater than zero, there's no reason in doing the loop across the ei->entries array for the entries less than ctx->pos. Instead, start the lookup of the entries at the current ctx->pos. Link: https://lore.kernel.org/all/CAHk-=wiKwDUDv3+jCsv-uacDcHDVTYsXtBR9=6sGM5mqX+DhOg@mail.gmail.com/ Link: https://lore.kernel.org/linux-trace-kernel/20240104220048.494956957@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Christian Brauner Cc: Greg Kroah-Hartman Suggested-by: Linus Torvalds Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index a1934e0eea3b7..fdff53d5a1f87 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -746,21 +746,15 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) if (!ei || !ei_dentry) goto out; - ret = 0; - /* * Need to create the dentries and inodes to have a consistent * inode number. */ - for (i = 0; i < ei->nr_entries; i++) { - void *cdata = ei->data; - - if (c > 0) { - c--; - continue; - } + ret = 0; - ctx->pos++; + /* Start at 'c' to jump over already read entries */ + for (i = c; i < ei->nr_entries; i++, ctx->pos++) { + void *cdata = ei->data; entry = &ei->entries[i]; name = entry->name; @@ -769,7 +763,7 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) /* If ei->is_freed then just bail here, nothing more to do */ if (ei->is_freed) { mutex_unlock(&eventfs_mutex); - goto out_dec; + goto out; } r = entry->callback(name, &mode, &cdata, &fops); mutex_unlock(&eventfs_mutex); @@ -778,14 +772,17 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) dentry = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops); if (!dentry) - goto out_dec; + goto out; ino = dentry->d_inode->i_ino; dput(dentry); if (!dir_emit(ctx, name, strlen(name), ino, DT_REG)) - goto out_dec; + goto out; } + /* Subtract the skipped entries above */ + c -= min((unsigned int)c, (unsigned int)ei->nr_entries); + list_for_each_entry_srcu(ei_child, &ei->children, list, srcu_read_lock_held(&eventfs_srcu)) { -- GitLab From 159956f34ede363e67a87bea840937e242293e91 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 Dec 2023 22:52:40 +0900 Subject: [PATCH 0672/1290] kbuild: deb-pkg: set DEB_* variables if debian/rules is directly executed Since commit 491b146d4c13 ("kbuild: builddeb: Eliminate debian/arch use"), direct execution of debian/rules results in the following error: dpkg-architecture: error: unknown option 'DEB_HOST_MULTIARCH' The current code: dpkg-architecture -a$DEB_HOST_ARCH -qDEB_HOST_MULTIARCH ... does not look sensible because: - For this code to work correctly, DEB_HOST_ARCH must be pre-defined, which is true when the packages are built via dpkg-buildpackage. In this case, DEB_HOST_MULTIARCH is also likely defined, hence there is no need to query DEB_HOST_MULTIARCH in the first place. - If DEB_HOST_MULTIARCH is undefined, DEB_HOST_ARCH is likely undefined too. So, you cannot query DEB_HOST_MULTIARCH in this way. This is mostly the case where debian/rules is directly executed. When debian/rules is directly executed, querying DEB_HOST_MUCHARCH is not enough because we need to know DEB_{BUILD,HOST}_GNU_TYPE as well. All DEB_* variables are defined when the package build is initiated by dpkg-buildpackage, but otherwise, let's call dpkg-architecture to set all DEB_* environment variables. This requires dpkg 1.20.6 or newer because --print-format option was added in dpkg commit 7c54fa2b232e ("dpkg-architecture: Add a --print-format option"). Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/builddeb | 5 ++--- scripts/package/debian/rules | 13 ++++++++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index 2fe51e6919da3..2eb4910f0ef34 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -171,9 +171,8 @@ install_libc_headers () { # move asm headers to /usr/include//asm to match the structure # used by Debian-based distros (to support multi-arch) - host_arch=$(dpkg-architecture -a$DEB_HOST_ARCH -qDEB_HOST_MULTIARCH) - mkdir $pdir/usr/include/$host_arch - mv $pdir/usr/include/asm $pdir/usr/include/$host_arch/ + mkdir "$pdir/usr/include/${DEB_HOST_MULTIARCH}" + mv "$pdir/usr/include/asm" "$pdir/usr/include/${DEB_HOST_MULTIARCH}" } rm -f debian/files diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules index 529b71b55efa5..3268340386dec 100755 --- a/scripts/package/debian/rules +++ b/scripts/package/debian/rules @@ -30,5 +30,16 @@ build-arch: .PHONY: clean clean: - rm -rf debian/files debian/linux-* + rm -rf debian/files debian/linux-* debian/deb-env.vars* $(MAKE) -f $(srctree)/Makefile ARCH=$(ARCH) clean + +# If DEB_HOST_ARCH is empty, it is likely that debian/rules was executed +# directly. Run 'dpkg-architecture --print-set --print-format=make' to +# generate a makefile construct that exports all DEB_* variables. +ifndef DEB_HOST_ARCH +include debian/deb-env.vars + +debian/deb-env.vars: + dpkg-architecture -a$$(cat debian/arch) --print-set --print-format=make > $@.tmp + mv $@.tmp $@ +endif -- GitLab From eaf80f7f2c9c5f08d76858ec32addfcfe64ce58e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 Dec 2023 22:52:41 +0900 Subject: [PATCH 0673/1290] kbuild: deb-pkg: allow to run debian/rules from output directory 'make O=... deb-pkg' creates the debian directory in the output directory. However, currently it is impossible to run debian/rules created in the separate output directory. This commit delays the $(srctree) expansion by escaping '$' and by quoting the entire command, making it possible to run debian/rules in the output directory. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/debian/rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules index 3268340386dec..6ba756d246de1 100755 --- a/scripts/package/debian/rules +++ b/scripts/package/debian/rules @@ -19,7 +19,7 @@ binary: binary-arch binary-indep binary-indep: build-indep binary-arch: build-arch $(MAKE) -f $(srctree)/Makefile $(make-opts) \ - run-command KBUILD_RUN_COMMAND=+$(srctree)/scripts/package/builddeb + run-command KBUILD_RUN_COMMAND='+$$(srctree)/scripts/package/builddeb' .PHONY: build build-indep build-arch build: build-arch build-indep -- GitLab From 68e262f8017d7fa5a9ea1ef21cbaa0fd5334ecd5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 Dec 2023 22:52:42 +0900 Subject: [PATCH 0674/1290] kbuild: deb-pkg: remove unneeded '-f $srctree/Makefile' in debian/rules This is unneeded because the Makefile in the output directory wraps the top-level Makefile in the srctree. Just run $(MAKE) irrespective of the build location. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/debian/rules | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules index 6ba756d246de1..0ffa806bbd789 100755 --- a/scripts/package/debian/rules +++ b/scripts/package/debian/rules @@ -3,8 +3,6 @@ include debian/rules.vars -srctree ?= . - ifneq (,$(filter-out parallel=1,$(filter parallel=%,$(DEB_BUILD_OPTIONS)))) NUMJOBS = $(patsubst parallel=%,%,$(filter parallel=%,$(DEB_BUILD_OPTIONS))) MAKEFLAGS += -j$(NUMJOBS) @@ -18,20 +16,20 @@ make-opts = ARCH=$(ARCH) KERNELRELEASE=$(KERNELRELEASE) KBUILD_BUILD_VERSION=$(r binary: binary-arch binary-indep binary-indep: build-indep binary-arch: build-arch - $(MAKE) -f $(srctree)/Makefile $(make-opts) \ + $(MAKE) $(make-opts) \ run-command KBUILD_RUN_COMMAND='+$$(srctree)/scripts/package/builddeb' .PHONY: build build-indep build-arch build: build-arch build-indep build-indep: build-arch: - $(MAKE) -f $(srctree)/Makefile $(make-opts) \ + $(MAKE) $(make-opts) \ olddefconfig all .PHONY: clean clean: rm -rf debian/files debian/linux-* debian/deb-env.vars* - $(MAKE) -f $(srctree)/Makefile ARCH=$(ARCH) clean + $(MAKE) ARCH=$(ARCH) clean # If DEB_HOST_ARCH is empty, it is likely that debian/rules was executed # directly. Run 'dpkg-architecture --print-set --print-format=make' to -- GitLab From 5e73758b43c3defba2578df6d3a53e942fa6b41e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 Dec 2023 22:52:43 +0900 Subject: [PATCH 0675/1290] kbuild: deb-pkg: use more debhelper commands in builddeb Commit 36862e14e316 ("kbuild: deb-pkg: use dh_listpackages to know enabled packages") started to require the debhelper tool suite. Use more dh_* commands in create_package(): - dh_installdocs to install copyright - dh_installchangelogs to install changelog - dh_compress to compress changelog - dh_fixperms to replace the raw chmod command - dh_gencontrol to replace the raw dpkg-gencontrol command - dh_md5sums to record the md5sum of included files - dh_builddeb to replace the raw dpkg-deb command Set DEB_RULES_REQUIRES_ROOT to 'no' in case debian/rules is executed directly. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/builddeb | 27 ++++++++++----------------- scripts/package/debian/rules | 3 +++ scripts/package/mkdebian | 2 +- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index 2eb4910f0ef34..436d55a83ab07 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -26,23 +26,16 @@ if_enabled_echo() { create_package() { local pname="$1" pdir="$2" - local dpkg_deb_opts - - mkdir -m 755 -p "$pdir/DEBIAN" - mkdir -p "$pdir/usr/share/doc/$pname" - cp debian/copyright "$pdir/usr/share/doc/$pname/" - cp debian/changelog "$pdir/usr/share/doc/$pname/changelog.Debian" - gzip -n -9 "$pdir/usr/share/doc/$pname/changelog.Debian" - sh -c "cd '$pdir'; find . -type f ! -path './DEBIAN/*' -printf '%P\0' \ - | xargs -r0 md5sum > DEBIAN/md5sums" - - # a+rX in case we are in a restrictive umask environment like 0077 - # ug-s in case we build in a setuid/setgid directory - chmod -R go-w,a+rX,ug-s "$pdir" - - # Create the package - dpkg-gencontrol -p$pname -P"$pdir" - dpkg-deb --root-owner-group ${KDEB_COMPRESS:+-Z$KDEB_COMPRESS} --build "$pdir" .. + + export DH_OPTIONS="-p${pname} -P${pdir}" + + dh_installdocs + dh_installchangelogs + dh_compress + dh_fixperms + dh_gencontrol + dh_md5sums + dh_builddeb -- ${KDEB_COMPRESS:+-Z$KDEB_COMPRESS} } install_linux_image () { diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules index 0ffa806bbd789..7ab31419579f0 100755 --- a/scripts/package/debian/rules +++ b/scripts/package/debian/rules @@ -1,6 +1,9 @@ #!/usr/bin/make -f # SPDX-License-Identifier: GPL-2.0-only +# in case debian/rules is executed directly +export DEB_RULES_REQUIRES_ROOT := no + include debian/rules.vars ifneq (,$(filter-out parallel=1,$(filter parallel=%,$(DEB_BUILD_OPTIONS)))) diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian index 93a24712b9a1d..070149c985fea 100755 --- a/scripts/package/mkdebian +++ b/scripts/package/mkdebian @@ -193,7 +193,7 @@ Section: kernel Priority: optional Maintainer: $maintainer Rules-Requires-Root: no -Build-Depends: debhelper +Build-Depends: debhelper-compat (= 12) Build-Depends-Arch: bc, bison, cpio, flex, kmod, libelf-dev:native, libssl-dev:native, rsync Homepage: https://www.kernel.org/ -- GitLab From 16c36f8864e354952eeeb8449034d63d372f621d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 Dec 2023 23:33:59 +0900 Subject: [PATCH 0676/1290] kbuild: deb-pkg: use build ID instead of debug link for dbg package There are two ways of managing separate debug info files: [1] The executable contains the .gnu_debuglink section, which specifies the name and the CRC of the separate debug info file. [2] The executable contains a build ID, and the corresponding debug info file is placed in the .build-id directory. We could do both, but the former, which 'make deb-pkg' currently does, results in complicated installation steps because we need to manually strip the debug sections, create debug links, and re-sign the modules. Besides, it is not working with module compression. This commit abandons the approach [1], and instead opts for [2]. Debian kernel commit de26137e2a9f ("Drop not needed extra step to add debug links") also stopped adding debug links. Signed-off-by: Masahiro Yamada --- scripts/package/builddeb | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index 436d55a83ab07..cc8c7a807fcc2 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -49,7 +49,7 @@ install_linux_image () { ${MAKE} -f ${srctree}/Makefile INSTALL_DTBS_PATH="${pdir}/usr/lib/linux-image-${KERNELRELEASE}" dtbs_install fi - ${MAKE} -f ${srctree}/Makefile INSTALL_MOD_PATH="${pdir}" modules_install + ${MAKE} -f ${srctree}/Makefile INSTALL_MOD_PATH="${pdir}" INSTALL_MOD_STRIP=1 modules_install rm -f "${pdir}/lib/modules/${KERNELRELEASE}/build" # Install the kernel @@ -110,25 +110,21 @@ install_linux_image () { install_linux_image_dbg () { pdir=$1 - image_pdir=$2 rm -rf ${pdir} - for module in $(find ${image_pdir}/lib/modules/ -name *.ko -printf '%P\n'); do - module=lib/modules/${module} - mkdir -p $(dirname ${pdir}/usr/lib/debug/${module}) - # only keep debug symbols in the debug file - ${OBJCOPY} --only-keep-debug ${image_pdir}/${module} ${pdir}/usr/lib/debug/${module} - # strip original module from debug symbols - ${OBJCOPY} --strip-debug ${image_pdir}/${module} - # then add a link to those - ${OBJCOPY} --add-gnu-debuglink=${pdir}/usr/lib/debug/${module} ${image_pdir}/${module} - done + # Parse modules.order directly because 'make modules_install' may sign, + # compress modules, and then run unneeded depmod. + while read -r mod; do + mod="${mod%.o}.ko" + dbg="${pdir}/usr/lib/debug/lib/modules/${KERNELRELEASE}/kernel/${mod}" + buildid=$("${READELF}" -n "${mod}" | sed -n 's@^.*Build ID: \(..\)\(.*\)@\1/\2@p') + link="${pdir}/usr/lib/debug/.build-id/${buildid}.debug" - # re-sign stripped modules - if is_enabled CONFIG_MODULE_SIG_ALL; then - ${MAKE} -f ${srctree}/Makefile INSTALL_MOD_PATH="${image_pdir}" modules_sign - fi + mkdir -p "${dbg%/*}" "${link%/*}" + "${OBJCOPY}" --only-keep-debug "${mod}" "${dbg}" + ln -sf --relative "${dbg}" "${link}" + done < modules.order # Build debug package # Different tools want the image in different locations @@ -176,9 +172,7 @@ for package in ${packages_enabled} do case ${package} in *-dbg) - # This must be done after linux-image, that is, we expect the - # debug package appears after linux-image in debian/control. - install_linux_image_dbg debian/linux-image-dbg debian/linux-image;; + install_linux_image_dbg debian/linux-image-dbg;; linux-image-*|user-mode-linux-*) install_linux_image debian/linux-image ${package};; linux-libc-dev) -- GitLab From 358c3f8cce6d8294e7ba72199f04771e9bff4b64 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 30 Dec 2023 21:02:52 +0900 Subject: [PATCH 0677/1290] kbuild: deb-pkg: do not search for 'scripts' directory under arch/ The 'scripts' directory was searched under arch/${SRCARCH} to copy arch/ia64/scripts, but commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture") removed arch/ia64/ entirely. There is another 'scripts' directory in arch/um/, but this script is never executed with SRCARCH=um because UML does not support the linux-headers package. Signed-off-by: Masahiro Yamada --- scripts/package/install-extmod-build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/install-extmod-build b/scripts/package/install-extmod-build index 8a7051fad0878..76e0765dfcd6e 100755 --- a/scripts/package/install-extmod-build +++ b/scripts/package/install-extmod-build @@ -20,7 +20,7 @@ mkdir -p "${destdir}" find "arch/${SRCARCH}" -maxdepth 1 -name 'Makefile*' find include scripts -type f -o -type l find "arch/${SRCARCH}" -name Kbuild.platforms -o -name Platform - find "arch/${SRCARCH}" -name include -o -name scripts -type d + find "arch/${SRCARCH}" -name include -type d ) | tar -c -f - -C "${srctree}" -T - | tar -xf - -C "${destdir}" { -- GitLab From 3dbb4e3602d217d7139b95a36077a6b7252dc290 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Thu, 4 Jan 2024 22:57:16 +0800 Subject: [PATCH 0678/1290] ASoC: dt-bindings: move tas2563 from tas2562.yaml to tas2781.yaml Move tas2563 from tas2562.yaml to tas2781.yaml to unbind tas2563 from tas2562 driver code and bind it to tas2781 driver code, because tas2563 only work in bypass-DSP mode with tas2562 driver. In order to enable DSP mode for tas2563, it has been moved to tas2781 driver. As to the hardware part, such as register setting and DSP firmware, all these are stored in the binary firmware. What tas2781 drivder does is to parse the firmware and download it to the chip, then power on the chip. So, tas2781 driver can be resued as tas2563 driver. Only attention will be paid to downloading corresponding firmware. Signed-off-by: Shenghao Ding Reviewed-by: Krzysztof Kozlowski Link: https://msgid.link/r/20240104145721.1398-1-shenghao-ding@ti.com Signed-off-by: Mark Brown --- .../devicetree/bindings/sound/tas2562.yaml | 2 - .../devicetree/bindings/sound/ti,tas2781.yaml | 78 +++++++++++++++---- 2 files changed, 63 insertions(+), 17 deletions(-) diff --git a/Documentation/devicetree/bindings/sound/tas2562.yaml b/Documentation/devicetree/bindings/sound/tas2562.yaml index f01c0dde0cf74..d28c102c0ce7f 100644 --- a/Documentation/devicetree/bindings/sound/tas2562.yaml +++ b/Documentation/devicetree/bindings/sound/tas2562.yaml @@ -18,7 +18,6 @@ description: | Specifications about the audio amplifier can be found at: https://www.ti.com/lit/gpn/tas2562 - https://www.ti.com/lit/gpn/tas2563 https://www.ti.com/lit/gpn/tas2564 https://www.ti.com/lit/gpn/tas2110 @@ -29,7 +28,6 @@ properties: compatible: enum: - ti,tas2562 - - ti,tas2563 - ti,tas2564 - ti,tas2110 diff --git a/Documentation/devicetree/bindings/sound/ti,tas2781.yaml b/Documentation/devicetree/bindings/sound/ti,tas2781.yaml index a69e6c223308e..9762386892495 100644 --- a/Documentation/devicetree/bindings/sound/ti,tas2781.yaml +++ b/Documentation/devicetree/bindings/sound/ti,tas2781.yaml @@ -5,36 +5,46 @@ $id: http://devicetree.org/schemas/sound/ti,tas2781.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: Texas Instruments TAS2781 SmartAMP +title: Texas Instruments TAS2563/TAS2781 SmartAMP maintainers: - Shenghao Ding -description: - The TAS2781 is a mono, digital input Class-D audio amplifier - optimized for efficiently driving high peak power into small - loudspeakers. An integrated on-chip DSP supports Texas Instruments - Smart Amp speaker protection algorithm. The integrated speaker - voltage and current sense provides for real time +description: | + The TAS2563/TAS2781 is a mono, digital input Class-D audio + amplifier optimized for efficiently driving high peak power into + small loudspeakers. An integrated on-chip DSP supports Texas + Instruments Smart Amp speaker protection algorithm. The + integrated speaker voltage and current sense provides for real time monitoring of loudspeaker behavior. -allOf: - - $ref: dai-common.yaml# + Specifications about the audio amplifier can be found at: + https://www.ti.com/lit/gpn/tas2563 + https://www.ti.com/lit/gpn/tas2781 properties: compatible: - enum: - - ti,tas2781 + description: | + ti,tas2563: 6.1-W Boosted Class-D Audio Amplifier With Integrated + DSP and IV Sense, 16/20/24/32bit stereo I2S or multichannel TDM. + + ti,tas2781: 24-V Class-D Amplifier with Real Time Integrated Speaker + Protection and Audio Processing, 16/20/24/32bit stereo I2S or + multichannel TDM. + oneOf: + - items: + - enum: + - ti,tas2563 + - const: ti,tas2781 + - enum: + - ti,tas2781 reg: description: - I2C address, in multiple tas2781s case, all the i2c address + I2C address, in multiple-AMP case, all the i2c address aggregate as one Audio Device to support multiple audio slots. maxItems: 8 minItems: 1 - items: - minimum: 0x38 - maximum: 0x3f reset-gpios: maxItems: 1 @@ -49,6 +59,44 @@ required: - compatible - reg +allOf: + - $ref: dai-common.yaml# + - if: + properties: + compatible: + contains: + enum: + - ti,tas2563 + then: + properties: + reg: + description: + I2C address, in multiple-AMP case, all the i2c address + aggregate as one Audio Device to support multiple audio slots. + maxItems: 4 + minItems: 1 + items: + minimum: 0x4c + maximum: 0x4f + + - if: + properties: + compatible: + contains: + enum: + - ti,tas2781 + then: + properties: + reg: + description: + I2C address, in multiple-AMP case, all the i2c address + aggregate as one Audio Device to support multiple audio slots. + maxItems: 8 + minItems: 1 + items: + minimum: 0x38 + maximum: 0x3f + additionalProperties: false examples: -- GitLab From 645994d21287a1ad2f637818d737f7a3d84e97d7 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Thu, 4 Jan 2024 22:57:17 +0800 Subject: [PATCH 0679/1290] ASoC: tas2562: move tas2563 from tas2562 driver to tas2781 driver Move tas2563 from tas2562 driver to tas2781 driver to unbind tas2563 from tas2562 driver code and bind it to tas2781 driver code, because tas2563 only work in bypass-DSP mode with tas2562 driver. In order to enable DSP mode for tas2563, it has been moved to tas2781 driver. As to the hardware part, such as register setting and DSP firmware, all these are stored in the binary firmware. What tas2781 drivder does is to parse the firmware and download it to the chip, then power on the chip. So, tas2781 driver can be resued as tas2563 driver. Only attention will be paid to downloading corresponding firmware. Signed-off-by: Shenghao Ding Link: https://msgid.link/r/20240104145721.1398-2-shenghao-ding@ti.com Signed-off-by: Mark Brown --- sound/soc/codecs/tas2562.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/sound/soc/codecs/tas2562.c b/sound/soc/codecs/tas2562.c index 962c2cdfa0174..54561ae598b87 100644 --- a/sound/soc/codecs/tas2562.c +++ b/sound/soc/codecs/tas2562.c @@ -59,7 +59,6 @@ struct tas2562_data { enum tas256x_model { TAS2562, - TAS2563, TAS2564, TAS2110, }; @@ -721,7 +720,6 @@ static int tas2562_parse_dt(struct tas2562_data *tas2562) static const struct i2c_device_id tas2562_id[] = { { "tas2562", TAS2562 }, - { "tas2563", TAS2563 }, { "tas2564", TAS2564 }, { "tas2110", TAS2110 }, { } @@ -770,7 +768,6 @@ static int tas2562_probe(struct i2c_client *client) #ifdef CONFIG_OF static const struct of_device_id tas2562_of_match[] = { { .compatible = "ti,tas2562", }, - { .compatible = "ti,tas2563", }, { .compatible = "ti,tas2564", }, { .compatible = "ti,tas2110", }, { }, -- GitLab From e9aa44736cb75e901d76ee59d80db1ae79d516f1 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Thu, 4 Jan 2024 22:57:18 +0800 Subject: [PATCH 0680/1290] ASoC: tas2781: Add tas2563 into header file for DSP mode Move tas2563 from tas2562 header file to tas2781 header file to unbind tas2563 from tas2562 driver code and bind it to tas2781 driver code, because tas2563 only work in bypass-DSP mode with tas2562 driver. In order to enable DSP mode for tas2563, it has been moved to tas2781 driver. As to the hardware part, such as register setting and DSP firmware, all these are stored in the binary firmware. What tas2781 drivder does is to parse the firmware and download it to the chip, then power on the chip. So, tas2781 driver can be resued as tas2563 driver. Only attention will be paid to downloading corresponding firmware. Signed-off-by: Shenghao Ding Link: https://msgid.link/r/20240104145721.1398-3-shenghao-ding@ti.com Signed-off-by: Mark Brown --- include/sound/tas2781.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h index a6c808b223183..813cf9446a58c 100644 --- a/include/sound/tas2781.h +++ b/include/sound/tas2781.h @@ -1,13 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ // -// ALSA SoC Texas Instruments TAS2781 Audio Smart Amplifier +// ALSA SoC Texas Instruments TAS2563/TAS2781 Audio Smart Amplifier // // Copyright (C) 2022 - 2023 Texas Instruments Incorporated // https://www.ti.com // -// The TAS2781 driver implements a flexible and configurable +// The TAS2563/TAS2781 driver implements a flexible and configurable // algo coefficient setting for one, two, or even multiple -// TAS2781 chips. +// TAS2563/TAS2781 chips. // // Author: Shenghao Ding // Author: Kevin Lu @@ -59,7 +59,8 @@ #define TASDEVICE_CMD_FIELD_W 0x4 enum audio_device { - TAS2781 = 0, + TAS2563, + TAS2781, }; enum device_catlog_id { -- GitLab From 9f1bcd16e2bd41d758438f1d74e5f2d35f1e8c8e Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Thu, 4 Jan 2024 22:57:19 +0800 Subject: [PATCH 0681/1290] ASoC: tas2781: Add tas2563 into driver Move tas2563 from tas2562 driver to tas2781 driver to unbind tas2563 from tas2562 driver code and bind it to tas2781 driver code, because tas2563 only work in bypass-DSP mode with tas2562 driver. In order to enable DSP mode for tas2563, it has been moved to tas2781 driver. As to the hardware part, such as register setting and DSP firmware, all these are stored in the binary firmware. What tas2781 drivder does is to parse the firmware and download it to the chip, then power on the chip. So, tas2781 driver can be resued as tas2563 driver. Only attention will be paid to downloading corresponding firmware. Signed-off-by: Shenghao Ding Link: https://msgid.link/r/20240104145721.1398-4-shenghao-ding@ti.com Signed-off-by: Mark Brown --- sound/soc/codecs/tas2781-i2c.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c index 55cd5e3c23a5d..bd5ef4ff96feb 100644 --- a/sound/soc/codecs/tas2781-i2c.c +++ b/sound/soc/codecs/tas2781-i2c.c @@ -1,13 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 // -// ALSA SoC Texas Instruments TAS2781 Audio Smart Amplifier +// ALSA SoC Texas Instruments TAS2563/TAS2781 Audio Smart Amplifier // // Copyright (C) 2022 - 2023 Texas Instruments Incorporated // https://www.ti.com // -// The TAS2781 driver implements a flexible and configurable +// The TAS2563/TAS2781 driver implements a flexible and configurable // algo coefficient setting for one, two, or even multiple -// TAS2781 chips. +// TAS2563/TAS2781 chips. // // Author: Shenghao Ding // Author: Kevin Lu @@ -32,6 +32,7 @@ #include static const struct i2c_device_id tasdevice_id[] = { + { "tas2563", TAS2563 }, { "tas2781", TAS2781 }, {} }; @@ -39,6 +40,7 @@ MODULE_DEVICE_TABLE(i2c, tasdevice_id); #ifdef CONFIG_OF static const struct of_device_id tasdevice_of_match[] = { + { .compatible = "ti,tas2563" }, { .compatible = "ti,tas2781" }, {}, }; -- GitLab From 0e4d464cda4c5996402343d4c9e2b6ceec716f93 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 5 Jan 2024 14:57:14 +0000 Subject: [PATCH 0682/1290] netfs: Mark netfs_unbuffered_write_iter_locked() static Mark netfs_unbuffered_write_iter_locked() static as it's only called from the file in which it is defined. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/direct_write.c | 4 ++-- fs/netfs/internal.h | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index aad05f2349a48..b9cbfd6a8a01d 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -27,8 +27,8 @@ static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) * Perform an unbuffered write where we may have to do an RMW operation on an * encrypted file. This can also be used for direct I/O writes. */ -ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, - struct netfs_group *netfs_group) +static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group) { struct netfs_io_request *wreq; unsigned long long start = iocb->ki_pos; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index d2d63120ac60a..a6dfc88883779 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -26,12 +26,6 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq); int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len); -/* - * direct_write.c - */ -ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, - struct netfs_group *netfs_group); - /* * io.c */ -- GitLab From 35040410372ca27a33cec8382d42c90b6b6c99f6 Mon Sep 17 00:00:00 2001 From: ChiYuan Huang Date: Fri, 29 Dec 2023 09:46:01 +0800 Subject: [PATCH 0683/1290] ASoC: codecs: rtq9128: Fix PM_RUNTIME usage If 'pm_runtime_resume_and_get' is used, must check the return value to prevent the active count not matched problem. Signed-off-by: ChiYuan Huang Link: https://msgid.link/r/bebd9e2bed9e0528a7fd9c528d785da02caf4f1a.1703813842.git.cy_huang@richtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rtq9128.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/rtq9128.c b/sound/soc/codecs/rtq9128.c index c22b047115cc4..bda64f9eeb624 100644 --- a/sound/soc/codecs/rtq9128.c +++ b/sound/soc/codecs/rtq9128.c @@ -391,7 +391,11 @@ static int rtq9128_component_probe(struct snd_soc_component *comp) unsigned int val; int i, ret; - pm_runtime_resume_and_get(comp->dev); + ret = pm_runtime_resume_and_get(comp->dev); + if (ret < 0) { + dev_err(comp->dev, "Failed to resume device (%d)\n", ret); + return ret; + } val = snd_soc_component_read(comp, RTQ9128_REG_EFUSE_DATA); -- GitLab From 415d10ccef712f3ec73cd880c1fef3eb48601c3a Mon Sep 17 00:00:00 2001 From: ChiYuan Huang Date: Fri, 29 Dec 2023 09:46:02 +0800 Subject: [PATCH 0684/1290] ASoC: codecs: rtq9128: Fix TDM enable and DAI format control flow To enable TDM mode, the current control flow limits the function calling order should be 'set_tdm_slot->set_dai_fmt'. But not all platform sound card like as simeple card to follow this design. To bypass this limit, adjust the DAI format setting in runtime 'hw_param' callback. Signed-off-by: ChiYuan Huang Link: https://msgid.link/r/c4c8df00d8d179b8b5b39a8521de3a85325c57e8.1703813842.git.cy_huang@richtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rtq9128.c | 67 ++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/sound/soc/codecs/rtq9128.c b/sound/soc/codecs/rtq9128.c index bda64f9eeb624..aa3eadecd9746 100644 --- a/sound/soc/codecs/rtq9128.c +++ b/sound/soc/codecs/rtq9128.c @@ -59,6 +59,7 @@ struct rtq9128_data { struct gpio_desc *enable; + unsigned int daifmt; int tdm_slots; int tdm_slot_width; bool tdm_input_data2_select; @@ -441,10 +442,7 @@ static const struct snd_soc_component_driver rtq9128_comp_driver = { static int rtq9128_dai_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) { struct rtq9128_data *data = snd_soc_dai_get_drvdata(dai); - struct snd_soc_component *comp = dai->component; struct device *dev = dai->dev; - unsigned int audfmt, fmtval; - int ret; dev_dbg(dev, "%s: fmt 0x%8x\n", __func__, fmt); @@ -454,35 +452,10 @@ static int rtq9128_dai_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) return -EINVAL; } - fmtval = fmt & SND_SOC_DAIFMT_FORMAT_MASK; - if (data->tdm_slots && fmtval != SND_SOC_DAIFMT_DSP_A && fmtval != SND_SOC_DAIFMT_DSP_B) { - dev_err(dev, "TDM is used, format only support DSP_A or DSP_B\n"); - return -EINVAL; - } + /* Store here and will be used in runtime hw_params for DAI format setting */ + data->daifmt = fmt; - switch (fmtval) { - case SND_SOC_DAIFMT_I2S: - audfmt = 8; - break; - case SND_SOC_DAIFMT_LEFT_J: - audfmt = 9; - break; - case SND_SOC_DAIFMT_RIGHT_J: - audfmt = 10; - break; - case SND_SOC_DAIFMT_DSP_A: - audfmt = data->tdm_slots ? 12 : 11; - break; - case SND_SOC_DAIFMT_DSP_B: - audfmt = data->tdm_slots ? 4 : 3; - break; - default: - dev_err(dev, "Unsupported format 0x%8x\n", fmt); - return -EINVAL; - } - - ret = snd_soc_component_write_field(comp, RTQ9128_REG_I2S_OPT, RTQ9128_AUDFMT_MASK, audfmt); - return ret < 0 ? ret : 0; + return 0; } static int rtq9128_dai_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mask, @@ -558,10 +531,38 @@ static int rtq9128_dai_hw_params(struct snd_pcm_substream *stream, struct snd_pc unsigned int width, slot_width, bitrate, audbit, dolen; struct snd_soc_component *comp = dai->component; struct device *dev = dai->dev; + unsigned int fmtval, audfmt; int ret; dev_dbg(dev, "%s: width %d\n", __func__, params_width(param)); + fmtval = FIELD_GET(SND_SOC_DAIFMT_FORMAT_MASK, data->daifmt); + if (data->tdm_slots && fmtval != SND_SOC_DAIFMT_DSP_A && fmtval != SND_SOC_DAIFMT_DSP_B) { + dev_err(dev, "TDM is used, format only support DSP_A or DSP_B\n"); + return -EINVAL; + } + + switch (fmtval) { + case SND_SOC_DAIFMT_I2S: + audfmt = 8; + break; + case SND_SOC_DAIFMT_LEFT_J: + audfmt = 9; + break; + case SND_SOC_DAIFMT_RIGHT_J: + audfmt = 10; + break; + case SND_SOC_DAIFMT_DSP_A: + audfmt = data->tdm_slots ? 12 : 11; + break; + case SND_SOC_DAIFMT_DSP_B: + audfmt = data->tdm_slots ? 4 : 3; + break; + default: + dev_err(dev, "Unsupported format 0x%8x\n", fmtval); + return -EINVAL; + } + switch (width = params_width(param)) { case 16: audbit = 0; @@ -615,6 +616,10 @@ static int rtq9128_dai_hw_params(struct snd_pcm_substream *stream, struct snd_pc return -EINVAL; } + ret = snd_soc_component_write_field(comp, RTQ9128_REG_I2S_OPT, RTQ9128_AUDFMT_MASK, audfmt); + if (ret < 0) + return ret; + ret = snd_soc_component_write_field(comp, RTQ9128_REG_I2S_OPT, RTQ9128_AUDBIT_MASK, audbit); if (ret < 0) return ret; -- GitLab From 4088e389476e3baababf9b22f34b9d8b3e557344 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 5 Jan 2024 14:55:52 +0000 Subject: [PATCH 0685/1290] netfs: Count DIO writes Provide a counter for DIO writes to match that for DIO reads. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/direct_write.c | 1 + fs/netfs/internal.h | 1 + fs/netfs/stats.c | 11 +++++++---- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index b9cbfd6a8a01d..60a40d293c87f 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -140,6 +140,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); trace_netfs_write_iter(iocb, from); + netfs_stat(&netfs_n_rh_dio_write); ret = netfs_start_io_direct(inode); if (ret < 0) diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index a6dfc88883779..3f9620d0fa63d 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -104,6 +104,7 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb); */ #ifdef CONFIG_NETFS_STATS extern atomic_t netfs_n_rh_dio_read; +extern atomic_t netfs_n_rh_dio_write; extern atomic_t netfs_n_rh_readahead; extern atomic_t netfs_n_rh_readpage; extern atomic_t netfs_n_rh_rreq; diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 15fd5c3f0f392..42db36528d921 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -10,6 +10,7 @@ #include "internal.h" atomic_t netfs_n_rh_dio_read; +atomic_t netfs_n_rh_dio_write; atomic_t netfs_n_rh_readahead; atomic_t netfs_n_rh_readpage; atomic_t netfs_n_rh_rreq; @@ -37,14 +38,13 @@ atomic_t netfs_n_wh_write_failed; int netfs_stats_show(struct seq_file *m, void *v) { - seq_printf(m, "Netfs : DR=%u RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n", + seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n", atomic_read(&netfs_n_rh_dio_read), + atomic_read(&netfs_n_rh_dio_write), atomic_read(&netfs_n_rh_readahead), atomic_read(&netfs_n_rh_readpage), atomic_read(&netfs_n_rh_write_begin), - atomic_read(&netfs_n_rh_write_zskip), - atomic_read(&netfs_n_rh_rreq), - atomic_read(&netfs_n_rh_sreq)); + atomic_read(&netfs_n_rh_write_zskip)); seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n", atomic_read(&netfs_n_rh_zero), atomic_read(&netfs_n_rh_short_read), @@ -66,6 +66,9 @@ int netfs_stats_show(struct seq_file *m, void *v) atomic_read(&netfs_n_wh_write), atomic_read(&netfs_n_wh_write_done), atomic_read(&netfs_n_wh_write_failed)); + seq_printf(m, "Netfs : rr=%u sr=%u\n", + atomic_read(&netfs_n_rh_rreq), + atomic_read(&netfs_n_rh_sreq)); return fscache_stats_show(m); } EXPORT_SYMBOL(netfs_stats_show); -- GitLab From 92a714d727ec9e7ccfcc7432d348aba730145914 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 4 Jan 2024 15:52:11 +0000 Subject: [PATCH 0686/1290] netfs: Fix interaction between write-streaming and cachefiles culling An issue can occur between write-streaming (storing dirty data in partial non-uptodate pages) and a cachefiles object being culled to make space. The problem occurs because the cache object is only marked in use while there are files open using it. Once it has been released, it can be culled and the cookie marked disabled. At this point, a streaming write is permitted to occur (if the cache is active, we require pages to be prefetched and cached), but the cache can become active again before this gets flushed out - and then two effects can occur: (1) The cache may be asked to write out a region that's less than its DIO block size (assumed by cachefiles to be PAGE_SIZE) - and this causes one of two debugging statements to be emitted. (2) netfs_how_to_modify() gets confused because it sees a page that isn't allowed to be non-uptodate being uptodate and tries to prefetch it - leading to a warning that PG_fscache is set twice. Fix this by the following means: (1) Add a netfs_inode flag to disallow write-streaming to an inode and set it if we ever do local caching of that inode. It remains set for the lifetime of that inode - even if the cookie becomes disabled. (2) If the no-write-streaming flag is set, then make netfs_how_to_modify() always want to prefetch instead. (3) If netfs_how_to_modify() decides it wants to prefetch a folio, but that folio has write-streamed data in it, then it requires the folio be flushed first. (4) Export a counter of the number of times we wanted to prefetch a non-uptodate page, but found it had write-streamed data in it. (5) Export a counter of the number of times we cancelled a write to the cache because it didn't DIO align and remove the debug statements. Reported-by: Marc Dionne Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-erofs@lists.ozlabs.org cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/cachefiles/io.c | 12 ++++++------ fs/netfs/buffered_write.c | 24 ++++++++++++++++++++---- fs/netfs/fscache_stats.c | 9 ++++++--- fs/netfs/internal.h | 1 + fs/netfs/stats.c | 6 ++++-- include/linux/fscache-cache.h | 3 +++ include/linux/netfs.h | 1 + 7 files changed, 41 insertions(+), 15 deletions(-) diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 7529b40bc95a9..3eec269674375 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -528,12 +528,12 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, /* Round to DIO size */ start = round_down(*_start, PAGE_SIZE); - if (start != *_start) { - kleave(" = -ENOBUFS [down]"); - return -ENOBUFS; - } - if (*_len > upper_len) { - kleave(" = -ENOBUFS [up]"); + if (start != *_start || *_len > upper_len) { + /* Probably asked to cache a streaming write written into the + * pagecache when the cookie was temporarily out of service to + * culling. + */ + fscache_count_dio_misfit(); return -ENOBUFS; } diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 08f28800232c3..6cd8f7422e9ab 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -80,10 +80,19 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, return NETFS_WHOLE_FOLIO_MODIFY; if (file->f_mode & FMODE_READ) - return NETFS_JUST_PREFETCH; - - if (netfs_is_cache_enabled(ctx)) - return NETFS_JUST_PREFETCH; + goto no_write_streaming; + if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags)) + goto no_write_streaming; + + if (netfs_is_cache_enabled(ctx)) { + /* We don't want to get a streaming write on a file that loses + * caching service temporarily because the backing store got + * culled. + */ + if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags)) + set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags); + goto no_write_streaming; + } if (!finfo) return NETFS_STREAMING_WRITE; @@ -95,6 +104,13 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, if (offset == finfo->dirty_offset + finfo->dirty_len) return NETFS_STREAMING_WRITE_CONT; return NETFS_FLUSH_CONTENT; + +no_write_streaming: + if (finfo) { + netfs_stat(&netfs_n_wh_wstream_conflict); + return NETFS_FLUSH_CONTENT; + } + return NETFS_JUST_PREFETCH; } /* diff --git a/fs/netfs/fscache_stats.c b/fs/netfs/fscache_stats.c index aad812ead398f..add21abdf7134 100644 --- a/fs/netfs/fscache_stats.c +++ b/fs/netfs/fscache_stats.c @@ -48,13 +48,15 @@ atomic_t fscache_n_no_create_space; EXPORT_SYMBOL(fscache_n_no_create_space); atomic_t fscache_n_culled; EXPORT_SYMBOL(fscache_n_culled); +atomic_t fscache_n_dio_misfit; +EXPORT_SYMBOL(fscache_n_dio_misfit); /* * display the general statistics */ int fscache_stats_show(struct seq_file *m) { - seq_puts(m, "FS-Cache statistics\n"); + seq_puts(m, "-- FS-Cache statistics --\n"); seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n", atomic_read(&fscache_n_cookies), atomic_read(&fscache_n_volumes), @@ -93,8 +95,9 @@ int fscache_stats_show(struct seq_file *m) atomic_read(&fscache_n_no_create_space), atomic_read(&fscache_n_culled)); - seq_printf(m, "IO : rd=%u wr=%u\n", + seq_printf(m, "IO : rd=%u wr=%u mis=%u\n", atomic_read(&fscache_n_read), - atomic_read(&fscache_n_write)); + atomic_read(&fscache_n_write), + atomic_read(&fscache_n_dio_misfit)); return 0; } diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 3f9620d0fa63d..ec7045d24400d 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -123,6 +123,7 @@ extern atomic_t netfs_n_rh_write_begin; extern atomic_t netfs_n_rh_write_done; extern atomic_t netfs_n_rh_write_failed; extern atomic_t netfs_n_rh_write_zskip; +extern atomic_t netfs_n_wh_wstream_conflict; extern atomic_t netfs_n_wh_upload; extern atomic_t netfs_n_wh_upload_done; extern atomic_t netfs_n_wh_upload_failed; diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 42db36528d921..deeba9f9dcf5d 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -29,6 +29,7 @@ atomic_t netfs_n_rh_write_begin; atomic_t netfs_n_rh_write_done; atomic_t netfs_n_rh_write_failed; atomic_t netfs_n_rh_write_zskip; +atomic_t netfs_n_wh_wstream_conflict; atomic_t netfs_n_wh_upload; atomic_t netfs_n_wh_upload_done; atomic_t netfs_n_wh_upload_failed; @@ -66,9 +67,10 @@ int netfs_stats_show(struct seq_file *m, void *v) atomic_read(&netfs_n_wh_write), atomic_read(&netfs_n_wh_write_done), atomic_read(&netfs_n_wh_write_failed)); - seq_printf(m, "Netfs : rr=%u sr=%u\n", + seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n", atomic_read(&netfs_n_rh_rreq), - atomic_read(&netfs_n_rh_sreq)); + atomic_read(&netfs_n_rh_sreq), + atomic_read(&netfs_n_wh_wstream_conflict)); return fscache_stats_show(m); } EXPORT_SYMBOL(netfs_stats_show); diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index a174cedf4d907..bdf7f3eddf0a2 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -189,17 +189,20 @@ extern atomic_t fscache_n_write; extern atomic_t fscache_n_no_write_space; extern atomic_t fscache_n_no_create_space; extern atomic_t fscache_n_culled; +extern atomic_t fscache_n_dio_misfit; #define fscache_count_read() atomic_inc(&fscache_n_read) #define fscache_count_write() atomic_inc(&fscache_n_write) #define fscache_count_no_write_space() atomic_inc(&fscache_n_no_write_space) #define fscache_count_no_create_space() atomic_inc(&fscache_n_no_create_space) #define fscache_count_culled() atomic_inc(&fscache_n_culled) +#define fscache_count_dio_misfit() atomic_inc(&fscache_n_dio_misfit) #else #define fscache_count_read() do {} while(0) #define fscache_count_write() do {} while(0) #define fscache_count_no_write_space() do {} while(0) #define fscache_count_no_create_space() do {} while(0) #define fscache_count_culled() do {} while(0) +#define fscache_count_dio_misfit() do {} while(0) #endif #endif /* _LINUX_FSCACHE_CACHE_H */ diff --git a/include/linux/netfs.h b/include/linux/netfs.h index d3bac60fcd6f3..100cbb261269d 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -142,6 +142,7 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ +#define NETFS_ICTX_NO_WRITE_STREAMING 3 /* Don't engage in write-streaming */ }; /* -- GitLab From 17dc11a02d8dacc7e78968daa2a8c16281eb7d1e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 5 Jan 2024 15:21:00 +0100 Subject: [PATCH 0687/1290] spi: coldfire-qspi: Remove an erroneous clk_disable_unprepare() from the remove function The commit in Fixes has changed a devm_clk_get()/clk_prepare_enable() into a devm_clk_get_enabled(). It has updated the error handling path of the probe accordingly, but the remove has been left unchanged. Remove now the redundant clk_disable_unprepare() call from the remove function. Fixes: a90a987ebe00 ("spi: use devm_clk_get_enabled() in mcfqspi_probe()") Signed-off-by: Christophe JAILLET Link: https://msgid.link/r/6670aed303e1f7680e0911387606a8ae069e2cef.1704464447.git.christophe.jaillet@wanadoo.fr Signed-off-by: Mark Brown --- drivers/spi/spi-coldfire-qspi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/spi/spi-coldfire-qspi.c b/drivers/spi/spi-coldfire-qspi.c index f0b630fe16c3c..b341b6908df06 100644 --- a/drivers/spi/spi-coldfire-qspi.c +++ b/drivers/spi/spi-coldfire-qspi.c @@ -441,7 +441,6 @@ static void mcfqspi_remove(struct platform_device *pdev) mcfqspi_wr_qmr(mcfqspi, MCFQSPI_QMR_MSTR); mcfqspi_cs_teardown(mcfqspi); - clk_disable_unprepare(mcfqspi->clk); } #ifdef CONFIG_PM_SLEEP -- GitLab From be12ad45e15b5ee0e2526a50266ba1d295d26a88 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Mon, 20 Nov 2023 09:14:06 +0000 Subject: [PATCH 0688/1290] hisi_acc_vfio_pci: Update migration data pointer correctly on saving/resume When the optional PRE_COPY support was added to speed up the device compatibility check, it failed to update the saving/resuming data pointers based on the fd offset. This results in migration data corruption and when the device gets started on the destination the following error is reported in some cases, [ 478.907684] arm-smmu-v3 arm-smmu-v3.2.auto: event 0x10 received: [ 478.913691] arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000310200000010 [ 478.919603] arm-smmu-v3 arm-smmu-v3.2.auto: 0x000002088000007f [ 478.925515] arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000000000000000 [ 478.931425] arm-smmu-v3 arm-smmu-v3.2.auto: 0x0000000000000000 [ 478.947552] hisi_zip 0000:31:00.0: qm_axi_rresp [error status=0x1] found [ 478.955930] hisi_zip 0000:31:00.0: qm_db_timeout [error status=0x400] found [ 478.955944] hisi_zip 0000:31:00.0: qm sq doorbell timeout in function 2 Fixes: d9a871e4a143 ("hisi_acc_vfio_pci: Introduce support for PRE_COPY state transitions") Signed-off-by: Shameer Kolothum Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231120091406.780-1-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index b2f9778c8366e..4d27465c8f1a8 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -694,6 +694,7 @@ static ssize_t hisi_acc_vf_resume_write(struct file *filp, const char __user *bu size_t len, loff_t *pos) { struct hisi_acc_vf_migration_file *migf = filp->private_data; + u8 *vf_data = (u8 *)&migf->vf_data; loff_t requested_length; ssize_t done = 0; int ret; @@ -715,7 +716,7 @@ static ssize_t hisi_acc_vf_resume_write(struct file *filp, const char __user *bu goto out_unlock; } - ret = copy_from_user(&migf->vf_data, buf, len); + ret = copy_from_user(vf_data + *pos, buf, len); if (ret) { done = -EFAULT; goto out_unlock; @@ -835,7 +836,9 @@ static ssize_t hisi_acc_vf_save_read(struct file *filp, char __user *buf, size_t len = min_t(size_t, migf->total_length - *pos, len); if (len) { - ret = copy_to_user(buf, &migf->vf_data, len); + u8 *vf_data = (u8 *)&migf->vf_data; + + ret = copy_to_user(buf, vf_data + *pos, len); if (ret) { done = -EFAULT; goto out_unlock; -- GitLab From f644d21baab34c837d639e9639aa8204dba7f3cf Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 18 Dec 2023 16:30:53 +0100 Subject: [PATCH 0689/1290] nvmet-fcloop: Remove remote port from list when unlinking The remote port is removed too late from fcloop_nports list. Remove it when port is unregistered. This prevents a busy loop in fcloop_exit, because it is possible the remote port is found in the list and thus we will never progress. The kernel log will be spammed with nvme_fcloop: fcloop_exit: Failed deleting remote port nvme_fcloop: fcloop_exit: Failed deleting target port Signed-off-by: Daniel Wagner Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/target/fcloop.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index c65a73433c05f..ead349af30f1e 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -995,11 +995,6 @@ fcloop_nport_free(struct kref *ref) { struct fcloop_nport *nport = container_of(ref, struct fcloop_nport, ref); - unsigned long flags; - - spin_lock_irqsave(&fcloop_lock, flags); - list_del(&nport->nport_list); - spin_unlock_irqrestore(&fcloop_lock, flags); kfree(nport); } @@ -1357,6 +1352,8 @@ __unlink_remote_port(struct fcloop_nport *nport) nport->tport->remoteport = NULL; nport->rport = NULL; + list_del(&nport->nport_list); + return rport; } -- GitLab From bd029a02ce46e77e4d553140b039f93cf78ee8c1 Mon Sep 17 00:00:00 2001 From: "Jim.Lin" Date: Tue, 28 Nov 2023 10:57:37 +0800 Subject: [PATCH 0690/1290] nvme-pci: disable write zeroes for SK Hynix BC901 SK Hynix BC901 drive write zero will cause Chromebook takes more than 20 mins to switch to developer mode "disable write zeroes" can fix this issue and Sk Hynix has been verified. Signed-off-by: Jim.Lin Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 507bc149046dc..f272026807419 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3394,6 +3394,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1c5c, 0x174a), /* SK Hynix P31 SSD */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1c5c, 0x1D59), /* SK Hynix BC901 */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x15b7, 0x2001), /* Sandisk Skyhawk */ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1d97, 0x2263), /* SPCC */ -- GitLab From bafd590910d00327decb3937e77f6f11c3e80e4b Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 27 Dec 2023 17:31:06 +0800 Subject: [PATCH 0691/1290] nvme: introduce nvme_disk_is_ns_head helper We currently rely on gendisk's file operations (fops) to distinguish between a namespace head (ns_head) and a regular namespace. To enhance code readability, introduce a helper function. Additionally, we must ensure that the device is not an ns_head before calling nvme_get_ns_from_dev(). To enforce this, add a WARN_ON check within the nvme_get_ns_from_dev(). Signed-off-by: Guixin Liu Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Liu Song [include fix: https://lore.kernel.org/oe-kbuild-all/202401031943.0N72Tkji-lkp@intel.com/] Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 13 ++++++++++++- drivers/nvme/host/pr.c | 2 +- drivers/nvme/host/sysfs.c | 8 ++++---- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 297b80430f1b6..6092cc3618375 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -920,6 +920,10 @@ extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; extern struct device_attribute subsys_attr_iopolicy; +static inline bool nvme_disk_is_ns_head(struct gendisk *disk) +{ + return disk->fops == &nvme_ns_head_ops; +} #else #define multipath false static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) @@ -997,6 +1001,10 @@ static inline void nvme_mpath_start_request(struct request *rq) static inline void nvme_mpath_end_request(struct request *rq) { } +static inline bool nvme_disk_is_ns_head(struct gendisk *disk) +{ + return false; +} #endif /* CONFIG_NVME_MULTIPATH */ int nvme_revalidate_zones(struct nvme_ns *ns); @@ -1025,7 +1033,10 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) { - return dev_to_disk(dev)->private_data; + struct gendisk *disk = dev_to_disk(dev); + + WARN_ON(nvme_disk_is_ns_head(disk)); + return disk->private_data; } #ifdef CONFIG_NVME_HWMON diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c index 391b1465ebfd5..fc3eed00f9ff1 100644 --- a/drivers/nvme/host/pr.c +++ b/drivers/nvme/host/pr.c @@ -98,7 +98,7 @@ static int nvme_send_pr_command(struct block_device *bdev, struct nvme_command *c, void *data, unsigned int data_len) { if (IS_ENABLED(CONFIG_NVME_MULTIPATH) && - bdev->bd_disk->fops == &nvme_ns_head_ops) + nvme_disk_is_ns_head(bdev->bd_disk)) return nvme_send_ns_head_pr_command(bdev, c, data, data_len); return nvme_send_ns_pr_command(bdev->bd_disk->private_data, c, data, diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index ac24ad1023806..754e911110420 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -39,10 +39,9 @@ static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); - if (disk->fops == &nvme_bdev_ops) - return nvme_get_ns_from_dev(dev)->head; - else + if (nvme_disk_is_ns_head(disk)) return disk->private_data; + return nvme_get_ns_from_dev(dev)->head; } static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, @@ -233,7 +232,8 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, } #ifdef CONFIG_NVME_MULTIPATH if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { - if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */ + /* per-path attr */ + if (nvme_disk_is_ns_head(dev_to_disk(dev))) return 0; if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) return 0; -- GitLab From 4ee7ffeb4ce50c80bc4504db6f39b25a2df6bcf4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Jan 2024 16:56:55 +0100 Subject: [PATCH 0692/1290] nvmet: re-fix tracing strncpy() warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An earlier patch had tried to address a warning about a string copy with missing zero termination: drivers/nvme/target/trace.h:52:3: warning: ‘strncpy’ specified bound 32 equals destination size [-Wstringop-truncation] The new version causes a different warning with some compiler versions, notably gcc-9 and gcc-10, and also misses the zero padding that was apparently done intentionally in the original code: drivers/nvme/target/trace.h:56:2: error: 'strncpy' specified bound depends on the length of the source argument [-Werror=stringop-overflow=] Change it to use strscpy_pad() with the original length, which will give a properly padded and zero-terminated string as well as avoiding the warning. Fixes: d86481e924a7 ("nvmet: use min of device_path and disk len") Signed-off-by: Arnd Bergmann Signed-off-by: Keith Busch --- drivers/nvme/target/trace.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/target/trace.h b/drivers/nvme/target/trace.h index 2f15070ddc569..89020018a0e35 100644 --- a/drivers/nvme/target/trace.h +++ b/drivers/nvme/target/trace.h @@ -59,8 +59,7 @@ static inline void __assign_req_name(char *name, struct nvmet_req *req) return; } - strncpy(name, req->ns->device_path, - min_t(size_t, DISK_NAME_LEN, strlen(req->ns->device_path))); + strscpy_pad(name, req->ns->device_path, DISK_NAME_LEN); } #endif -- GitLab From a7de1dea76cd6a3707707af4ea2f8bc3cdeaeb11 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Jan 2024 16:56:56 +0100 Subject: [PATCH 0693/1290] nvme: trace: avoid memcpy overflow warning A previous patch introduced a struct_group() in nvme_common_command to help stringop fortification figure out the length of the fields, but one function is not currently using them: In file included from drivers/nvme/target/core.c:7: In file included from include/linux/string.h:254: include/linux/fortify-string.h:592:4: error: call to '__read_overflow2_field' declared with 'warning' attribute: detected read beyond size of field (2nd parameter); maybe use struct_group()? [-Werror,-Wattribute-warning] __read_overflow2_field(q_size_field, size); ^ Change this one to use the correct field name to avoid the warning. Fixes: 5c629dc9609dc ("nvme: use struct group for generic command dwords") Signed-off-by: Arnd Bergmann Signed-off-by: Keith Busch --- drivers/nvme/target/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/trace.h b/drivers/nvme/target/trace.h index 89020018a0e35..7f7ebf9558e50 100644 --- a/drivers/nvme/target/trace.h +++ b/drivers/nvme/target/trace.h @@ -90,7 +90,7 @@ TRACE_EVENT(nvmet_req_init, __entry->flags = cmd->common.flags; __entry->nsid = le32_to_cpu(cmd->common.nsid); __entry->metadata = le64_to_cpu(cmd->common.metadata); - memcpy(__entry->cdw10, &cmd->common.cdw10, + memcpy(__entry->cdw10, &cmd->common.cdws, sizeof(__entry->cdw10)); ), TP_printk("nvmet%s: %sqid=%d, cmdid=%u, nsid=%u, flags=%#x, " -- GitLab From 98856b2ea3065d8f60e90f423d7707f4a4706ec5 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 5 Jan 2024 15:07:34 -0700 Subject: [PATCH 0694/1290] cxl: Introduce put_cxl_root() helper Add a helper function put_cxl_root() to maintain symmetry for find_cxl_root() function instead of relying on open coding of the put_device() in order to dereference the 'struct device' that happens via get_device() in find_cxl_root(). Suggested-by: Robert Richter Reviewed-by: Ira Weiny Signed-off-by: Dave Jiang Reviewed-by: Robert Richter Link: https://lore.kernel.org/r/170449245417.3779673.4566146351673989387.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 9 +++++++++ drivers/cxl/cxl.h | 3 +++ 2 files changed, 12 insertions(+) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 8c00fd6be730e..64f30d5fe1f62 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -986,6 +986,15 @@ struct cxl_port *find_cxl_root(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(find_cxl_root, CXL); +void put_cxl_root(struct cxl_root *cxl_root) +{ + if (!cxl_root) + return; + + put_device(&cxl_root->port.dev); +} +EXPORT_SYMBOL_NS_GPL(put_cxl_root, CXL); + static struct cxl_dport *find_dport(struct cxl_port *port, int id) { struct cxl_dport *dport; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 492dbf63935fd..df3db3e43913a 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -735,6 +735,9 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct cxl_root *devm_cxl_add_root(struct device *host, const struct cxl_root_ops *ops); struct cxl_port *find_cxl_root(struct cxl_port *port); +void put_cxl_root(struct cxl_root *cxl_root); +DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_cxl_root(_T)) + int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd); void cxl_bus_rescan(void); void cxl_bus_drain(void); -- GitLab From 44cd71ef7bacdfcf7c3e8a7b13f11a7eba69533d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 5 Jan 2024 15:07:40 -0700 Subject: [PATCH 0695/1290] cxl: Convert find_cxl_root() to return a 'struct cxl_root *' Commit 790815902ec6 ("cxl: Add support for _DSM Function for retrieving QTG ID") introduced 'struct cxl_root', however all usages have been worked indirectly through cxl_port. Refactor code such as find_cxl_root() function to use 'struct cxl_root' directly. Suggested-by: Dan Williams Reviewed-by: Ira Weiny Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170449246044.3779673.13035770941393418591.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 6 ++---- drivers/cxl/core/cdat.c | 17 ++++++++++------- drivers/cxl/core/pmem.c | 6 ++++-- drivers/cxl/core/port.c | 4 ++-- drivers/cxl/cxl.h | 14 +++++++------- drivers/cxl/port.c | 4 +++- 6 files changed, 28 insertions(+), 23 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index afc712264d1c2..dcf2b39e10488 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -295,14 +295,12 @@ out: return rc; } -static int cxl_acpi_qos_class(struct cxl_port *root_port, +static int cxl_acpi_qos_class(struct cxl_root *cxl_root, struct access_coordinate *coord, int entries, int *qos_class) { + struct device *dev = cxl_root->port.uport_dev; acpi_handle handle; - struct device *dev; - - dev = root_port->uport_dev; if (!dev_is_platform(dev)) return -ENODEV; diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index cd84d87f597a6..0df5379cf02f5 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -162,7 +162,6 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, struct xarray *dsmas_xa) { struct access_coordinate c; - struct cxl_port *root_port; struct cxl_root *cxl_root; struct dsmas_entry *dent; int valid_entries = 0; @@ -175,8 +174,7 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, return rc; } - root_port = find_cxl_root(port); - cxl_root = to_cxl_root(root_port); + cxl_root = find_cxl_root(port); if (!cxl_root->ops || !cxl_root->ops->qos_class) return -EOPNOTSUPP; @@ -193,7 +191,8 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, dent->coord.write_bandwidth); dent->entries = 1; - rc = cxl_root->ops->qos_class(root_port, &dent->coord, 1, &qos_class); + rc = cxl_root->ops->qos_class(cxl_root, &dent->coord, 1, + &qos_class); if (rc != 1) continue; @@ -349,15 +348,19 @@ static int cxl_qos_class_verify(struct cxl_memdev *cxlmd) { struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - struct cxl_port *root_port __free(put_device) = NULL; LIST_HEAD(__discard); struct list_head *discard __free(dpa_perf) = &__discard; + struct cxl_port *root_port; int rc; - root_port = find_cxl_root(cxlmd->endpoint); - if (!root_port) + struct cxl_root *cxl_root __free(put_cxl_root) = + find_cxl_root(cxlmd->endpoint); + + if (!cxl_root) return -ENODEV; + root_port = &cxl_root->port; + /* Check that the QTG IDs are all sane between end device and root decoders */ cxl_qos_match(root_port, &mds->ram_perf_list, discard); cxl_qos_match(root_port, &mds->pmem_perf_list, discard); diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index fc94f52403271..da92a901b9e8c 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -64,12 +64,14 @@ static int match_nvdimm_bridge(struct device *dev, void *data) struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_memdev *cxlmd) { - struct cxl_port *port = find_cxl_root(cxlmd->endpoint); + struct cxl_root *cxl_root = find_cxl_root(cxlmd->endpoint); + struct cxl_port *port; struct device *dev; - if (!port) + if (!cxl_root) return NULL; + port = &cxl_root->port; dev = device_find_child(&port->dev, NULL, match_nvdimm_bridge); put_device(&port->dev); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 64f30d5fe1f62..63a4e3c2baed4 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -972,7 +972,7 @@ static bool dev_is_cxl_root_child(struct device *dev) return false; } -struct cxl_port *find_cxl_root(struct cxl_port *port) +struct cxl_root *find_cxl_root(struct cxl_port *port) { struct cxl_port *iter = port; @@ -982,7 +982,7 @@ struct cxl_port *find_cxl_root(struct cxl_port *port) if (!iter) return NULL; get_device(&iter->dev); - return iter; + return to_cxl_root(iter); } EXPORT_SYMBOL_NS_GPL(find_cxl_root, CXL); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index df3db3e43913a..3a5004aab97aa 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -617,12 +617,6 @@ struct cxl_port { long pci_latency; }; -struct cxl_root_ops { - int (*qos_class)(struct cxl_port *root_port, - struct access_coordinate *coord, int entries, - int *qos_class); -}; - /** * struct cxl_root - logical collection of root cxl_port items * @@ -640,6 +634,12 @@ to_cxl_root(const struct cxl_port *port) return container_of(port, struct cxl_root, port); } +struct cxl_root_ops { + int (*qos_class)(struct cxl_root *cxl_root, + struct access_coordinate *coord, int entries, + int *qos_class); +}; + static inline struct cxl_dport * cxl_find_dport_by_dev(struct cxl_port *port, const struct device *dport_dev) { @@ -734,7 +734,7 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct cxl_dport *parent_dport); struct cxl_root *devm_cxl_add_root(struct device *host, const struct cxl_root_ops *ops); -struct cxl_port *find_cxl_root(struct cxl_port *port); +struct cxl_root *find_cxl_root(struct cxl_port *port); void put_cxl_root(struct cxl_root *cxl_root); DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_cxl_root(_T)) diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index da3c3a08bd626..4f3a08fdc9e94 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -94,6 +94,7 @@ static int cxl_endpoint_port_probe(struct cxl_port *port) struct cxl_endpoint_dvsec_info info = { .port = port }; struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_root *cxl_root; struct cxl_hdm *cxlhdm; struct cxl_port *root; int rc; @@ -130,7 +131,8 @@ static int cxl_endpoint_port_probe(struct cxl_port *port) * This can't fail in practice as CXL root exit unregisters all * descendant ports and that in turn synchronizes with cxl_port_probe() */ - root = find_cxl_root(port); + cxl_root = find_cxl_root(port); + root = &cxl_root->port; /* * Now that all endpoint decoders are successfully enumerated, try to -- GitLab From 98e7ab3345e105339b7d919b4918f3c1879052ed Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 5 Jan 2024 15:07:46 -0700 Subject: [PATCH 0696/1290] cxl: Fix device reference leak in cxl_port_perf_data_calculate() cxl_port_perf_data_calculate() calls find_cxl_root() and does not dereference the 'struct device' in the cxl_root->port. find_cxl_root() calls get_device() and takes a reference on the port 'struct device' member. Use the __free() macro to ensure the dereference happens. Fixes: 7a4f148dd8d5 ("cxl: Compute the entire CXL path latency and bandwidth data") Reviewed-by: Ira Weiny Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170449246681.3779673.2288926019977963333.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/cdat.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index 0df5379cf02f5..c6208aab452ff 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -162,7 +162,6 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, struct xarray *dsmas_xa) { struct access_coordinate c; - struct cxl_root *cxl_root; struct dsmas_entry *dent; int valid_entries = 0; unsigned long index; @@ -174,7 +173,11 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, return rc; } - cxl_root = find_cxl_root(port); + struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); + + if (!cxl_root) + return -ENODEV; + if (!cxl_root->ops || !cxl_root->ops->qos_class) return -EOPNOTSUPP; -- GitLab From 66f11890d35a609012037cccfd9e63ed98474f99 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 5 Jan 2024 15:07:53 -0700 Subject: [PATCH 0697/1290] cxl: Refactor to use __free() for cxl_root allocation in cxl_find_nvdimm_bridge() Use scope-based resource management __free() macro to drop the open coded put_device() in cxl_find_nvdimm_bridge(). Reviewed-by: Ira Weiny Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170449247353.3779673.5963704495491343135.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/pmem.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index da92a901b9e8c..e69625a8d6a1d 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -64,16 +64,14 @@ static int match_nvdimm_bridge(struct device *dev, void *data) struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_memdev *cxlmd) { - struct cxl_root *cxl_root = find_cxl_root(cxlmd->endpoint); - struct cxl_port *port; + struct cxl_root *cxl_root __free(put_cxl_root) = + find_cxl_root(cxlmd->endpoint); struct device *dev; if (!cxl_root) return NULL; - port = &cxl_root->port; - dev = device_find_child(&port->dev, NULL, match_nvdimm_bridge); - put_device(&port->dev); + dev = device_find_child(&cxl_root->port.dev, NULL, match_nvdimm_bridge); if (!dev) return NULL; -- GitLab From 321dd36c286b3f982b341ce9a273b0f66f0e00ed Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 5 Jan 2024 15:07:59 -0700 Subject: [PATCH 0698/1290] cxl: Refactor to use __free() for cxl_root allocation in cxl_endpoint_port_probe() Use scope-based resource management __free() macro to drop the open coded put_device() in cxl_endpoint_port_probe(). Reviewed-by: Ira Weiny Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170449247973.3779673.15088722836135359275.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/port.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 4f3a08fdc9e94..97c21566677aa 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -94,7 +94,6 @@ static int cxl_endpoint_port_probe(struct cxl_port *port) struct cxl_endpoint_dvsec_info info = { .port = port }; struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_root *cxl_root; struct cxl_hdm *cxlhdm; struct cxl_port *root; int rc; @@ -131,7 +130,8 @@ static int cxl_endpoint_port_probe(struct cxl_port *port) * This can't fail in practice as CXL root exit unregisters all * descendant ports and that in turn synchronizes with cxl_port_probe() */ - cxl_root = find_cxl_root(port); + struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); + root = &cxl_root->port; /* @@ -139,7 +139,6 @@ static int cxl_endpoint_port_probe(struct cxl_port *port) * assemble regions from committed decoders */ device_for_each_child(&port->dev, root, discover_region); - put_device(&root->dev); return 0; } -- GitLab From 807c6d09cc99cbdf9933edfadcbaa8f0b856848d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 5 Jan 2024 22:03:58 +0000 Subject: [PATCH 0699/1290] netfs: Fix the loop that unmarks folios after writing to the cache In the loop in netfs_rreq_unmark_after_write() that removes the PG_fscache from folios after they've been written to the cache, as soon as we remove the mark from a multipage folio, it can get split - and then we might see a fragment of folio again. Guard against this by advancing the 'unlocked' tracker to the index of the last page in the folio to avoid a double removal of the PG_fscache mark. Reported-by: Marc Dionne Signed-off-by: David Howells cc: Matthew Wilcox cc: linux-afs@lists.infradead.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/netfs/buffered_write.c | 1 + fs/netfs/io.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 6cd8f7422e9ab..0b2b7a60dabc0 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -698,6 +698,7 @@ static void netfs_pages_written_back(struct netfs_io_request *wreq) end_wb: if (folio_test_fscache(folio)) folio_end_fscache(folio); + xas_advance(&xas, folio_next_index(folio) - 1); folio_end_writeback(folio); } diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 5b5af96cd4b95..4309edf338627 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -126,7 +126,7 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, */ if (have_unlocked && folio_index(folio) <= unlocked) continue; - unlocked = folio_index(folio); + unlocked = folio_next_index(folio) - 1; trace_netfs_folio(folio, netfs_folio_trace_end_copy); folio_end_fscache(folio); have_unlocked = true; -- GitLab From 26a1a86dd093a10d0653429bf013dae6e95dccbf Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:29 -0800 Subject: [PATCH 0700/1290] cxl/events: Promote CXL event structures to a core header UEFI code can process CXL events through CPER records. Those records use almost the same format as the CXL events. Lift the CXL event structures to a core header to be shared in later patches. [jic123: drop "CXL rev 3.0" mention] Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-2-1bb8a4ca2c7a@intel.com [djbw: add F: entry to maintainers for include/linux/cxl-event.h] Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- MAINTAINERS | 1 + drivers/cxl/cxlmem.h | 90 +------------------------------------ include/linux/cxl-event.h | 95 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 89 deletions(-) create mode 100644 include/linux/cxl-event.h diff --git a/MAINTAINERS b/MAINTAINERS index 7cef2d2ef8d70..04f6c52d0a8f1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5245,6 +5245,7 @@ M: Dan Williams L: linux-cxl@vger.kernel.org S: Maintained F: drivers/cxl/ +F: include/linux/cxl-event.h F: include/uapi/linux/cxl_mem.h F: tools/testing/cxl/ diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index a2fcbca253f39..f0e7ebb84f028 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "cxl.h" /* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */ @@ -579,27 +580,6 @@ struct cxl_mbox_identify { u8 qos_telemetry_caps; } __packed; -/* - * Common Event Record Format - * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 - */ -struct cxl_event_record_hdr { - uuid_t id; - u8 length; - u8 flags[3]; - __le16 handle; - __le16 related_handle; - __le64 timestamp; - u8 maint_op_class; - u8 reserved[15]; -} __packed; - -#define CXL_EVENT_RECORD_DATA_LENGTH 0x50 -struct cxl_event_record_raw { - struct cxl_event_record_hdr hdr; - u8 data[CXL_EVENT_RECORD_DATA_LENGTH]; -} __packed; - /* * Get Event Records output payload * CXL rev 3.0 section 8.2.9.2.2; Table 8-50 @@ -641,74 +621,6 @@ struct cxl_mbox_clear_event_payload { } __packed; #define CXL_CLEAR_EVENT_MAX_HANDLES U8_MAX -/* - * General Media Event Record - * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 - */ -#define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 -struct cxl_event_gen_media { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; - u8 descriptor; - u8 type; - u8 transaction_type; - u8 validity_flags[2]; - u8 channel; - u8 rank; - u8 device[3]; - u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE]; - u8 reserved[46]; -} __packed; - -/* - * DRAM Event Record - DER - * CXL rev 3.0 section 8.2.9.2.1.2; Table 3-44 - */ -#define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 -struct cxl_event_dram { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; - u8 descriptor; - u8 type; - u8 transaction_type; - u8 validity_flags[2]; - u8 channel; - u8 rank; - u8 nibble_mask[3]; - u8 bank_group; - u8 bank; - u8 row[3]; - u8 column[2]; - u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE]; - u8 reserved[0x17]; -} __packed; - -/* - * Get Health Info Record - * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100 - */ -struct cxl_get_health_info { - u8 health_status; - u8 media_status; - u8 add_status; - u8 life_used; - u8 device_temp[2]; - u8 dirty_shutdown_cnt[4]; - u8 cor_vol_err_cnt[4]; - u8 cor_per_err_cnt[4]; -} __packed; - -/* - * Memory Module Event Record - * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 - */ -struct cxl_event_mem_module { - struct cxl_event_record_hdr hdr; - u8 event_type; - struct cxl_get_health_info info; - u8 reserved[0x3d]; -} __packed; - struct cxl_mbox_get_partition_info { __le64 active_volatile_cap; __le64 active_persistent_cap; diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h new file mode 100644 index 0000000000000..0fc068123f8ed --- /dev/null +++ b/include/linux/cxl-event.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Intel Corporation. */ +#ifndef _LINUX_CXL_EVENT_H +#define _LINUX_CXL_EVENT_H + +/* + * Common Event Record Format + * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 + */ +struct cxl_event_record_hdr { + uuid_t id; + u8 length; + u8 flags[3]; + __le16 handle; + __le16 related_handle; + __le64 timestamp; + u8 maint_op_class; + u8 reserved[15]; +} __packed; + +#define CXL_EVENT_RECORD_DATA_LENGTH 0x50 +struct cxl_event_record_raw { + struct cxl_event_record_hdr hdr; + u8 data[CXL_EVENT_RECORD_DATA_LENGTH]; +} __packed; + +/* + * General Media Event Record + * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 + */ +#define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 +struct cxl_event_gen_media { + struct cxl_event_record_hdr hdr; + __le64 phys_addr; + u8 descriptor; + u8 type; + u8 transaction_type; + u8 validity_flags[2]; + u8 channel; + u8 rank; + u8 device[3]; + u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE]; + u8 reserved[46]; +} __packed; + +/* + * DRAM Event Record - DER + * CXL rev 3.0 section 8.2.9.2.1.2; Table 3-44 + */ +#define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 +struct cxl_event_dram { + struct cxl_event_record_hdr hdr; + __le64 phys_addr; + u8 descriptor; + u8 type; + u8 transaction_type; + u8 validity_flags[2]; + u8 channel; + u8 rank; + u8 nibble_mask[3]; + u8 bank_group; + u8 bank; + u8 row[3]; + u8 column[2]; + u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE]; + u8 reserved[0x17]; +} __packed; + +/* + * Get Health Info Record + * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100 + */ +struct cxl_get_health_info { + u8 health_status; + u8 media_status; + u8 add_status; + u8 life_used; + u8 device_temp[2]; + u8 dirty_shutdown_cnt[4]; + u8 cor_vol_err_cnt[4]; + u8 cor_per_err_cnt[4]; +} __packed; + +/* + * Memory Module Event Record + * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 + */ +struct cxl_event_mem_module { + struct cxl_event_record_hdr hdr; + u8 event_type; + struct cxl_get_health_info info; + u8 reserved[0x3d]; +} __packed; + +#endif /* _LINUX_CXL_EVENT_H */ -- GitLab From 9f67c1e63976d3403f0b250b03ffe959c890f9db Mon Sep 17 00:00:00 2001 From: Esteban Blanc Date: Tue, 7 Nov 2023 10:47:01 +0100 Subject: [PATCH 0701/1290] rtc: tps6594: Add driver for TPS6594 RTC TPS6594 PMIC is a MFD. This patch adds support for the RTC found inside TPS6594 family of PMIC. Alarm is also supported. Signed-off-by: Esteban Blanc Reviewed-by: Andy Shevchenko Tested-by: Thomas Richard Link: https://lore.kernel.org/r/20231107094701.2223486-1-eblanc@baylibre.com Signed-off-by: Alexandre Belloni --- drivers/rtc/Kconfig | 12 + drivers/rtc/Makefile | 1 + drivers/rtc/rtc-tps6594.c | 454 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 467 insertions(+) create mode 100644 drivers/rtc/rtc-tps6594.c diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 4e3c2fde2d6a6..68a530ef10669 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -578,6 +578,18 @@ config RTC_DRV_TPS6586X along with alarm. This driver supports the RTC driver for the TPS6586X RTC module. +config RTC_DRV_TPS6594 + tristate "TI TPS6594 RTC driver" + depends on MFD_TPS6594 + default MFD_TPS6594 + help + TI Power Management IC TPS6594 supports RTC functionality + along with alarm. This driver supports the RTC driver for + the TPS6594 RTC module. + + This driver can also be built as a module. If so, the module + will be called rtc-tps6594. + config RTC_DRV_TPS65910 tristate "TI TPS65910 RTC driver" depends on MFD_TPS65910 diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 6817ea4e44687..31c4606fd9bf0 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -177,6 +177,7 @@ obj-$(CONFIG_RTC_DRV_TEGRA) += rtc-tegra.o obj-$(CONFIG_RTC_DRV_TEST) += rtc-test.o obj-$(CONFIG_RTC_DRV_TI_K3) += rtc-ti-k3.o obj-$(CONFIG_RTC_DRV_TPS6586X) += rtc-tps6586x.o +obj-$(CONFIG_RTC_DRV_TPS6594) += rtc-tps6594.o obj-$(CONFIG_RTC_DRV_TPS65910) += rtc-tps65910.o obj-$(CONFIG_RTC_DRV_TWL4030) += rtc-twl.o obj-$(CONFIG_RTC_DRV_VT8500) += rtc-vt8500.o diff --git a/drivers/rtc/rtc-tps6594.c b/drivers/rtc/rtc-tps6594.c new file mode 100644 index 0000000000000..838ae8562a351 --- /dev/null +++ b/drivers/rtc/rtc-tps6594.c @@ -0,0 +1,454 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RTC driver for tps6594 PMIC + * + * Copyright (C) 2023 BayLibre Incorporated - https://www.baylibre.com/ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// Total number of RTC registers needed to set time +#define NUM_TIME_REGS (TPS6594_REG_RTC_WEEKS - TPS6594_REG_RTC_SECONDS + 1) + +// Total number of RTC alarm registers +#define NUM_TIME_ALARM_REGS (NUM_TIME_REGS - 1) + +/* + * Min and max values supported by 'offset' interface (swapped sign). + * After conversion, the values do not exceed the range [-32767, 33767] + * which COMP_REG must conform to. + */ +#define MIN_OFFSET (-277774) +#define MAX_OFFSET (277774) + +// Number of ticks per hour +#define TICKS_PER_HOUR (32768 * 3600) + +// Multiplier for ppb conversions +#define PPB_MULT NANO + +static int tps6594_rtc_alarm_irq_enable(struct device *dev, + unsigned int enabled) +{ + struct tps6594 *tps = dev_get_drvdata(dev->parent); + u8 val; + + val = enabled ? TPS6594_BIT_IT_ALARM : 0; + + return regmap_update_bits(tps->regmap, TPS6594_REG_RTC_INTERRUPTS, + TPS6594_BIT_IT_ALARM, val); +} + +/* Pulse GET_TIME field of RTC_CTRL_1 to store a timestamp in shadow registers. */ +static int tps6594_rtc_shadow_timestamp(struct device *dev, struct tps6594 *tps) +{ + int ret; + + /* + * Set GET_TIME to 0. Next time we set GET_TIME to 1 we will be sure to store + * an up-to-date timestamp. + */ + ret = regmap_clear_bits(tps->regmap, TPS6594_REG_RTC_CTRL_1, + TPS6594_BIT_GET_TIME); + if (ret < 0) + return ret; + + /* + * Copy content of RTC registers to shadow registers or latches to read + * a coherent timestamp. + */ + return regmap_set_bits(tps->regmap, TPS6594_REG_RTC_CTRL_1, + TPS6594_BIT_GET_TIME); +} + +static int tps6594_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + unsigned char rtc_data[NUM_TIME_REGS]; + struct tps6594 *tps = dev_get_drvdata(dev->parent); + int ret; + + // Check if RTC is running. + ret = regmap_test_bits(tps->regmap, TPS6594_REG_RTC_STATUS, + TPS6594_BIT_RUN); + if (ret < 0) + return ret; + if (ret == 0) + return -EINVAL; + + ret = tps6594_rtc_shadow_timestamp(dev, tps); + if (ret < 0) + return ret; + + // Read shadowed RTC registers. + ret = regmap_bulk_read(tps->regmap, TPS6594_REG_RTC_SECONDS, rtc_data, + NUM_TIME_REGS); + if (ret < 0) + return ret; + + tm->tm_sec = bcd2bin(rtc_data[0]); + tm->tm_min = bcd2bin(rtc_data[1]); + tm->tm_hour = bcd2bin(rtc_data[2]); + tm->tm_mday = bcd2bin(rtc_data[3]); + tm->tm_mon = bcd2bin(rtc_data[4]) - 1; + tm->tm_year = bcd2bin(rtc_data[5]) + 100; + tm->tm_wday = bcd2bin(rtc_data[6]); + + return 0; +} + +static int tps6594_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + unsigned char rtc_data[NUM_TIME_REGS]; + struct tps6594 *tps = dev_get_drvdata(dev->parent); + int ret; + + rtc_data[0] = bin2bcd(tm->tm_sec); + rtc_data[1] = bin2bcd(tm->tm_min); + rtc_data[2] = bin2bcd(tm->tm_hour); + rtc_data[3] = bin2bcd(tm->tm_mday); + rtc_data[4] = bin2bcd(tm->tm_mon + 1); + rtc_data[5] = bin2bcd(tm->tm_year - 100); + rtc_data[6] = bin2bcd(tm->tm_wday); + + // Stop RTC while updating the RTC time registers. + ret = regmap_clear_bits(tps->regmap, TPS6594_REG_RTC_CTRL_1, + TPS6594_BIT_STOP_RTC); + if (ret < 0) + return ret; + + // Update all the time registers in one shot. + ret = regmap_bulk_write(tps->regmap, TPS6594_REG_RTC_SECONDS, rtc_data, + NUM_TIME_REGS); + if (ret < 0) + return ret; + + // Start back RTC. + return regmap_set_bits(tps->regmap, TPS6594_REG_RTC_CTRL_1, + TPS6594_BIT_STOP_RTC); +} + +static int tps6594_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm) +{ + unsigned char alarm_data[NUM_TIME_ALARM_REGS]; + u32 int_val; + struct tps6594 *tps = dev_get_drvdata(dev->parent); + int ret; + + ret = regmap_bulk_read(tps->regmap, TPS6594_REG_ALARM_SECONDS, + alarm_data, NUM_TIME_ALARM_REGS); + if (ret < 0) + return ret; + + alm->time.tm_sec = bcd2bin(alarm_data[0]); + alm->time.tm_min = bcd2bin(alarm_data[1]); + alm->time.tm_hour = bcd2bin(alarm_data[2]); + alm->time.tm_mday = bcd2bin(alarm_data[3]); + alm->time.tm_mon = bcd2bin(alarm_data[4]) - 1; + alm->time.tm_year = bcd2bin(alarm_data[5]) + 100; + + ret = regmap_read(tps->regmap, TPS6594_REG_RTC_INTERRUPTS, &int_val); + if (ret < 0) + return ret; + + alm->enabled = int_val & TPS6594_BIT_IT_ALARM; + + return 0; +} + +static int tps6594_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm) +{ + unsigned char alarm_data[NUM_TIME_ALARM_REGS]; + struct tps6594 *tps = dev_get_drvdata(dev->parent); + int ret; + + // Disable alarm irq before changing the alarm timestamp. + ret = tps6594_rtc_alarm_irq_enable(dev, 0); + if (ret) + return ret; + + alarm_data[0] = bin2bcd(alm->time.tm_sec); + alarm_data[1] = bin2bcd(alm->time.tm_min); + alarm_data[2] = bin2bcd(alm->time.tm_hour); + alarm_data[3] = bin2bcd(alm->time.tm_mday); + alarm_data[4] = bin2bcd(alm->time.tm_mon + 1); + alarm_data[5] = bin2bcd(alm->time.tm_year - 100); + + // Update all the alarm registers in one shot. + ret = regmap_bulk_write(tps->regmap, TPS6594_REG_ALARM_SECONDS, + alarm_data, NUM_TIME_ALARM_REGS); + if (ret < 0) + return ret; + + if (alm->enabled) + ret = tps6594_rtc_alarm_irq_enable(dev, 1); + + return ret; +} + +static int tps6594_rtc_set_calibration(struct device *dev, int calibration) +{ + struct tps6594 *tps = dev_get_drvdata(dev->parent); + __le16 value; + int ret; + + /* + * TPS6594 uses two's complement 16 bit value for compensation of RTC + * crystal inaccuracies. One time every hour when seconds counter + * increments from 0 to 1 compensation value will be added to internal + * RTC counter value. + * + * Valid range for compensation value: [-32767 .. 32767]. + */ + if (calibration < S16_MIN + 1 || calibration > S16_MAX) + return -ERANGE; + + value = cpu_to_le16(calibration); + + // Update all the compensation registers in one shot. + ret = regmap_bulk_write(tps->regmap, TPS6594_REG_RTC_COMP_LSB, &value, + sizeof(value)); + if (ret < 0) + return ret; + + // Enable automatic compensation. + return regmap_set_bits(tps->regmap, TPS6594_REG_RTC_CTRL_1, + TPS6594_BIT_AUTO_COMP); +} + +static int tps6594_rtc_get_calibration(struct device *dev, int *calibration) +{ + struct tps6594 *tps = dev_get_drvdata(dev->parent); + unsigned int ctrl; + __le16 value; + int ret; + + ret = regmap_read(tps->regmap, TPS6594_REG_RTC_CTRL_1, &ctrl); + if (ret < 0) + return ret; + + // If automatic compensation is not enabled report back zero. + if (!(ctrl & TPS6594_BIT_AUTO_COMP)) { + *calibration = 0; + return 0; + } + + ret = regmap_bulk_read(tps->regmap, TPS6594_REG_RTC_COMP_LSB, &value, + sizeof(value)); + if (ret < 0) + return ret; + + *calibration = le16_to_cpu(value); + + return 0; +} + +static int tps6594_rtc_read_offset(struct device *dev, long *offset) +{ + int calibration; + s64 tmp; + int ret; + + ret = tps6594_rtc_get_calibration(dev, &calibration); + if (ret < 0) + return ret; + + // Convert from RTC calibration register format to ppb format. + tmp = calibration * PPB_MULT; + + if (tmp < 0) + tmp -= TICKS_PER_HOUR / 2LL; + else + tmp += TICKS_PER_HOUR / 2LL; + tmp = div_s64(tmp, TICKS_PER_HOUR); + + /* + * SAFETY: + * Computatiion is the reverse operation of the one done in + * `tps6594_rtc_set_offset`. The safety remarks applie here too. + */ + + /* + * Offset value operates in negative way, so swap sign. + * See 8.3.10.5, (32768 - COMP_REG). + */ + *offset = (long)-tmp; + + return 0; +} + +static int tps6594_rtc_set_offset(struct device *dev, long offset) +{ + int calibration; + s64 tmp; + + // Make sure offset value is within supported range. + if (offset < MIN_OFFSET || offset > MAX_OFFSET) + return -ERANGE; + + // Convert from ppb format to RTC calibration register format. + + tmp = offset * TICKS_PER_HOUR; + if (tmp < 0) + tmp -= PPB_MULT / 2LL; + else + tmp += PPB_MULT / 2LL; + tmp = div_s64(tmp, PPB_MULT); + + /* + * SAFETY: + * - tmp = offset * TICK_PER_HOUR : + * `offset` can't be more than 277774, so `tmp` can't exceed 277774000000000 + * which is lower than the maximum value in an `s64` (2^63-1). No overflow here. + * + * - tmp += TICK_PER_HOUR / 2LL : + * tmp will have a maximum value of 277774117964800 which is still inferior to 2^63-1. + */ + + // Offset value operates in negative way, so swap sign. + calibration = (int)-tmp; + + return tps6594_rtc_set_calibration(dev, calibration); +} + +static irqreturn_t tps6594_rtc_interrupt(int irq, void *rtc) +{ + struct device *dev = rtc; + struct tps6594 *tps = dev_get_drvdata(dev->parent); + struct rtc_device *rtc_dev = dev_get_drvdata(dev); + int ret; + u32 rtc_reg; + + ret = regmap_read(tps->regmap, TPS6594_REG_RTC_STATUS, &rtc_reg); + if (ret) + return IRQ_NONE; + + rtc_update_irq(rtc_dev, 1, RTC_IRQF | RTC_AF); + + return IRQ_HANDLED; +} + +static const struct rtc_class_ops tps6594_rtc_ops = { + .read_time = tps6594_rtc_read_time, + .set_time = tps6594_rtc_set_time, + .read_alarm = tps6594_rtc_read_alarm, + .set_alarm = tps6594_rtc_set_alarm, + .alarm_irq_enable = tps6594_rtc_alarm_irq_enable, + .read_offset = tps6594_rtc_read_offset, + .set_offset = tps6594_rtc_set_offset, +}; + +static int tps6594_rtc_probe(struct platform_device *pdev) +{ + struct tps6594 *tps = dev_get_drvdata(pdev->dev.parent); + struct device *dev = &pdev->dev; + struct rtc_device *rtc; + int irq; + int ret; + + rtc = devm_kzalloc(dev, sizeof(*rtc), GFP_KERNEL); + if (!rtc) + return -ENOMEM; + + rtc = devm_rtc_allocate_device(dev); + if (IS_ERR(rtc)) + return PTR_ERR(rtc); + + // Enable crystal oscillator. + ret = regmap_set_bits(tps->regmap, TPS6594_REG_RTC_CTRL_2, + TPS6594_BIT_XTAL_EN); + if (ret < 0) + return ret; + + ret = regmap_test_bits(tps->regmap, TPS6594_REG_RTC_STATUS, + TPS6594_BIT_RUN); + if (ret < 0) + return ret; + // RTC not running. + if (ret == 0) { + ret = regmap_set_bits(tps->regmap, TPS6594_REG_RTC_CTRL_1, + TPS6594_BIT_STOP_RTC); + if (ret < 0) + return ret; + + /* + * On some boards, a 40 ms delay is needed before BIT_RUN is set. + * 80 ms should provide sufficient margin. + */ + mdelay(80); + + /* + * RTC should be running now. Check if this is the case. + * If not it might be a missing oscillator. + */ + ret = regmap_test_bits(tps->regmap, TPS6594_REG_RTC_STATUS, + TPS6594_BIT_RUN); + if (ret < 0) + return ret; + if (ret == 0) + return -ENODEV; + + // Stop RTC until first call to `tps6594_rtc_set_time`. + ret = regmap_clear_bits(tps->regmap, TPS6594_REG_RTC_CTRL_1, + TPS6594_BIT_STOP_RTC); + if (ret < 0) + return ret; + } + + platform_set_drvdata(pdev, rtc); + + irq = platform_get_irq_byname(pdev, TPS6594_IRQ_NAME_ALARM); + if (irq < 0) + return dev_err_probe(dev, irq, "Failed to get irq\n"); + + ret = devm_request_threaded_irq(dev, irq, NULL, tps6594_rtc_interrupt, + IRQF_ONESHOT, TPS6594_IRQ_NAME_ALARM, + dev); + if (ret < 0) + return dev_err_probe(dev, ret, + "Failed to request_threaded_irq\n"); + + ret = device_init_wakeup(dev, true); + if (ret < 0) + return dev_err_probe(dev, ret, + "Failed to init rtc as wakeup source\n"); + + rtc->ops = &tps6594_rtc_ops; + rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; + rtc->range_max = RTC_TIMESTAMP_END_2099; + + return devm_rtc_register_device(rtc); +} + +static const struct platform_device_id tps6594_rtc_id_table[] = { + { "tps6594-rtc", }, + {} +}; +MODULE_DEVICE_TABLE(platform, tps6594_rtc_id_table); + +static struct platform_driver tps6594_rtc_driver = { + .probe = tps6594_rtc_probe, + .driver = { + .name = "tps6594-rtc", + }, + .id_table = tps6594_rtc_id_table, +}; + +module_platform_driver(tps6594_rtc_driver); +MODULE_AUTHOR("Esteban Blanc "); +MODULE_DESCRIPTION("TPS6594 RTC driver"); +MODULE_LICENSE("GPL"); -- GitLab From cd0d7d6639de4468d181c87e45cfd07d51b525da Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Tue, 14 Nov 2023 13:45:31 +0200 Subject: [PATCH 0702/1290] rtc: lpc24xx: add missing dependency The driver depends on COMMON_CLK. Add the dependency in Kconfig. Signed-off-by: Antoniu Miclaus Link: https://lore.kernel.org/r/20231114114532.37840-1-antoniu.miclaus@analog.com Signed-off-by: Alexandre Belloni --- drivers/rtc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 68a530ef10669..1ea9c76001505 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1717,6 +1717,7 @@ config RTC_DRV_LPC24XX tristate "NXP RTC for LPC178x/18xx/408x/43xx" depends on ARCH_LPC18XX || COMPILE_TEST depends on OF && HAS_IOMEM + depends on COMMON_CLK help This enables support for the NXP RTC found which can be found on NXP LPC178x/18xx/408x/43xx devices. -- GitLab From 3628d999e31e2e31b51f78b0f68e91275854c179 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 18 Nov 2023 18:22:00 +0900 Subject: [PATCH 0703/1290] rtc: ds3232: avoid unused-const-variable warning Fix the following warning when CONFIG_I2C is not set but CONFIG_SPI_MASTER is set. warning: 'ds3232_pm_ops' defined but not used [-Wunused-const-variable=] 'ds3232_pm_ops' is only used by rtc-ds3232 i2c driver, so move the device PM callbacks inside #if IS_ENABLED(CONFIG_I2C). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311172325.33tTniaJ-lkp@intel.com/ Signed-off-by: Akinobu Mita Link: https://lore.kernel.org/r/20231118092200.829808-1-akinobu.mita@gmail.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds3232.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c index 89d7b085f7219..1485a6ae51e61 100644 --- a/drivers/rtc/rtc-ds3232.c +++ b/drivers/rtc/rtc-ds3232.c @@ -536,6 +536,8 @@ static int ds3232_probe(struct device *dev, struct regmap *regmap, int irq, return 0; } +#if IS_ENABLED(CONFIG_I2C) + #ifdef CONFIG_PM_SLEEP static int ds3232_suspend(struct device *dev) { @@ -564,8 +566,6 @@ static const struct dev_pm_ops ds3232_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(ds3232_suspend, ds3232_resume) }; -#if IS_ENABLED(CONFIG_I2C) - static int ds3232_i2c_probe(struct i2c_client *client) { struct regmap *regmap; -- GitLab From 2f80de657f83a1f6495e557096847f4626d57164 Mon Sep 17 00:00:00 2001 From: Stefan Eichenberger Date: Wed, 22 Nov 2023 19:16:11 +0100 Subject: [PATCH 0704/1290] rtc: rv8803: Add power management support Add power management support to the driver. This allows a SoC to wake from suspend using the nINT provided by the RTC. Only register it as a wakeup device if the interrupt is provided and handled. Signed-off-by: Stefan Eichenberger Link: https://lore.kernel.org/r/20231122181611.164792-1-eichest@gmail.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-rv8803.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/rtc/rtc-rv8803.c b/drivers/rtc/rtc-rv8803.c index 1a3ec1bb5b814..11e6b0d31f5d7 100644 --- a/drivers/rtc/rtc-rv8803.c +++ b/drivers/rtc/rtc-rv8803.c @@ -17,6 +17,7 @@ #include #include #include +#include #define RV8803_I2C_TRY_COUNT 4 @@ -607,6 +608,28 @@ static int rv8803_regs_configure(struct rv8803_data *rv8803) return 0; } +static int rv8803_resume(struct device *dev) +{ + struct rv8803_data *rv8803 = dev_get_drvdata(dev); + + if (rv8803->client->irq > 0 && device_may_wakeup(dev)) + disable_irq_wake(rv8803->client->irq); + + return 0; +} + +static int rv8803_suspend(struct device *dev) +{ + struct rv8803_data *rv8803 = dev_get_drvdata(dev); + + if (rv8803->client->irq > 0 && device_may_wakeup(dev)) + enable_irq_wake(rv8803->client->irq); + + return 0; +} + +static DEFINE_SIMPLE_DEV_PM_OPS(rv8803_pm_ops, rv8803_suspend, rv8803_resume); + static const struct i2c_device_id rv8803_id[] = { { "rv8803", rv_8803 }, { "rv8804", rx_8804 }, @@ -683,6 +706,11 @@ static int rv8803_probe(struct i2c_client *client) if (err) { dev_warn(&client->dev, "unable to request IRQ, alarms disabled\n"); client->irq = 0; + } else { + device_init_wakeup(&client->dev, true); + err = dev_pm_set_wake_irq(&client->dev, client->irq); + if (err) + dev_err(&client->dev, "failed to set wake IRQ\n"); } } if (!client->irq) @@ -737,6 +765,7 @@ static struct i2c_driver rv8803_driver = { .driver = { .name = "rtc-rv8803", .of_match_table = of_match_ptr(rv8803_of_match), + .pm = &rv8803_pm_ops, }, .probe = rv8803_probe, .id_table = rv8803_id, -- GitLab From 33f4ac16540509af518580abe730d409e8098aca Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 30 Nov 2023 18:32:23 +0100 Subject: [PATCH 0705/1290] dt-bindings: rtc: qcom-pm8xxx: fix inconsistent example The PM8921 is an SSBI PMIC but in the binding example it is described as being part of an SPMI PMIC while using an SSBI address. Make the example consistent by using the sibling PM8941 SPMI PMIC instead. Fixes: 8138c5f0318c ("dt-bindings: rtc: qcom-pm8xxx-rtc: Add qcom pm8xxx rtc bindings") Signed-off-by: Johan Hovold Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231130173223.12794-1-johan+linaro@kernel.org Signed-off-by: Alexandre Belloni --- .../bindings/rtc/qcom-pm8xxx-rtc.yaml | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml b/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml index b95a69cc9ae0f..d274bb7a534b5 100644 --- a/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml @@ -61,27 +61,27 @@ additionalProperties: false examples: - | + #include #include - spmi_bus: spmi@c440000 { - reg = <0x0c440000 0x1100>; - #address-cells = <2>; - #size-cells = <0>; - pmicintc: pmic@0 { - reg = <0x0 SPMI_USID>; - compatible = "qcom,pm8921"; - interrupts = <104 8>; - #interrupt-cells = <2>; - interrupt-controller; - #address-cells = <1>; + + spmi { + #address-cells = <2>; #size-cells = <0>; - pm8921_rtc: rtc@11d { - compatible = "qcom,pm8921-rtc"; - reg = <0x11d>; - interrupts = <0x27 0>; - nvmem-cells = <&rtc_offset>; - nvmem-cell-names = "offset"; + pmic@0 { + compatible = "qcom,pm8941", "qcom,spmi-pmic"; + reg = <0x0 SPMI_USID>; + #address-cells = <1>; + #size-cells = <0>; + + rtc@6000 { + compatible = "qcom,pm8941-rtc"; + reg = <0x6000>, <0x6100>; + reg-names = "rtc", "alarm"; + interrupts = <0x0 0x61 0x1 IRQ_TYPE_EDGE_RISING>; + nvmem-cells = <&rtc_offset>; + nvmem-cell-names = "offset"; + }; }; - }; }; ... -- GitLab From e9a2162495ce387647bae565483f978369a29839 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sun, 17 Dec 2023 23:58:31 +0100 Subject: [PATCH 0706/1290] rtc: ma35d1: remove hardcoded UIE support Let the core handle UIE instead of enabling it forcefully at probe which means the RTC will generate an interrupt every second even when nobody cares. Link: https://lore.kernel.org/r/20231217225831.48581-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ma35d1.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/rtc/rtc-ma35d1.c b/drivers/rtc/rtc-ma35d1.c index 07c9a083a9d5d..cfcfc28060f62 100644 --- a/drivers/rtc/rtc-ma35d1.c +++ b/drivers/rtc/rtc-ma35d1.c @@ -79,11 +79,6 @@ static irqreturn_t ma35d1_rtc_interrupt(int irq, void *data) events |= RTC_AF | RTC_IRQF; } - if (rtc_irq & RTC_INTSTS_UIF) { - rtc_reg_write(rtc, MA35_REG_RTC_INTSTS, RTC_INTSTS_UIF); - events |= RTC_UF | RTC_IRQF; - } - rtc_update_irq(rtc->rtcdev, 1, events); return IRQ_HANDLED; @@ -216,7 +211,6 @@ static int ma35d1_rtc_probe(struct platform_device *pdev) { struct ma35_rtc *rtc; struct clk *clk; - u32 regval; int ret; rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL); @@ -264,40 +258,26 @@ static int ma35d1_rtc_probe(struct platform_device *pdev) if (ret) return dev_err_probe(&pdev->dev, ret, "Failed to register rtc device\n"); - regval = rtc_reg_read(rtc, MA35_REG_RTC_INTEN); - regval |= RTC_INTEN_UIEN; - rtc_reg_write(rtc, MA35_REG_RTC_INTEN, regval); - return 0; } static int ma35d1_rtc_suspend(struct platform_device *pdev, pm_message_t state) { struct ma35_rtc *rtc = platform_get_drvdata(pdev); - u32 regval; if (device_may_wakeup(&pdev->dev)) enable_irq_wake(rtc->irq_num); - regval = rtc_reg_read(rtc, MA35_REG_RTC_INTEN); - regval &= ~RTC_INTEN_UIEN; - rtc_reg_write(rtc, MA35_REG_RTC_INTEN, regval); - return 0; } static int ma35d1_rtc_resume(struct platform_device *pdev) { struct ma35_rtc *rtc = platform_get_drvdata(pdev); - u32 regval; if (device_may_wakeup(&pdev->dev)) disable_irq_wake(rtc->irq_num); - regval = rtc_reg_read(rtc, MA35_REG_RTC_INTEN); - regval |= RTC_INTEN_UIEN; - rtc_reg_write(rtc, MA35_REG_RTC_INTEN, regval); - return 0; } -- GitLab From 54e1898e113dc65974103620bbe7a9f856f80f6e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 11 Dec 2023 14:26:00 +0100 Subject: [PATCH 0707/1290] rtc: MAINTAINERS: drop Alessandro Zummo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Last email from Alessandro was in 2016, so remove him from maintainers of the RTC subsystem. Stale maintainer entries hide information whether subsystem needs help, has a bus-factor or is even orphaned. Link: https://lore.kernel.org/all/?q=f%3A%22Alessandro+Zummo%22 Cc: Alexandre Belloni Cc: Uwe Kleine-König Cc: Arnd Bergmann Cc: Alessandro Zummo Signed-off-by: Krzysztof Kozlowski Acked-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231211132600.101090-1-krzysztof.kozlowski@linaro.org Signed-off-by: Alexandre Belloni --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cfd..2635f7b59e3ca 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18286,7 +18286,6 @@ X: include/linux/srcu*.h X: kernel/rcu/srcu*.c REAL TIME CLOCK (RTC) SUBSYSTEM -M: Alessandro Zummo M: Alexandre Belloni L: linux-rtc@vger.kernel.org S: Maintained -- GitLab From e3d3fe7e7bf08820a83c9d9a4c38c7b29a2927f1 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 19 Dec 2023 06:07:12 +0100 Subject: [PATCH 0708/1290] rtc: class: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/4f2c049cb09d46fed336e22445c71988b4f340d6.1702962419.git.christophe.jaillet@wanadoo.fr Signed-off-by: Alexandre Belloni --- drivers/rtc/class.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index edfd942f8c549..921ee18279743 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -256,7 +256,7 @@ static int rtc_device_get_id(struct device *dev) of_id = of_alias_get_id(dev->parent->of_node, "rtc"); if (of_id >= 0) { - id = ida_simple_get(&rtc_ida, of_id, of_id + 1, GFP_KERNEL); + id = ida_alloc_range(&rtc_ida, of_id, of_id, GFP_KERNEL); if (id < 0) dev_warn(dev, "/aliases ID %d not available\n", of_id); } -- GitLab From c7ec4f2d684e17d69bbdd7c4324db0ef5daac26a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 8 Jan 2024 07:41:39 +0100 Subject: [PATCH 0709/1290] xen-netback: don't produce zero-size SKB frags While frontends may submit zero-size requests (wasting a precious slot), core networking code as of at least 3ece782693c4b ("sock: skb_copy_ubufs support for compound pages") can't deal with SKBs when they have all zero-size fragments. Respond to empty requests right when populating fragments; all further processing is fragment based and hence won't encounter these empty requests anymore. In a way this should have been that way from the beginning: When no data is to be transferred for a particular request, there's not even a point in validating the respective grant ref. That's no different from e.g. passing NULL into memcpy() when at the same time the size is 0. This is XSA-448 / CVE-2023-46838. Cc: stable@vger.kernel.org Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Reviewed-by: Paul Durrant --- drivers/net/xen-netback/netback.c | 44 ++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 88f760a7cbc35..d7503aef599f0 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -463,12 +463,25 @@ static void xenvif_get_requests(struct xenvif_queue *queue, } for (shinfo->nr_frags = 0; nr_slots > 0 && shinfo->nr_frags < MAX_SKB_FRAGS; - shinfo->nr_frags++, gop++, nr_slots--) { + nr_slots--) { + if (unlikely(!txp->size)) { + unsigned long flags; + + spin_lock_irqsave(&queue->response_lock, flags); + make_tx_response(queue, txp, 0, XEN_NETIF_RSP_OKAY); + push_tx_responses(queue); + spin_unlock_irqrestore(&queue->response_lock, flags); + ++txp; + continue; + } + index = pending_index(queue->pending_cons++); pending_idx = queue->pending_ring[index]; xenvif_tx_create_map_op(queue, pending_idx, txp, txp == first ? extra_count : 0, gop); frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx); + ++shinfo->nr_frags; + ++gop; if (txp == first) txp = txfrags; @@ -481,20 +494,39 @@ static void xenvif_get_requests(struct xenvif_queue *queue, shinfo = skb_shinfo(nskb); frags = shinfo->frags; - for (shinfo->nr_frags = 0; shinfo->nr_frags < nr_slots; - shinfo->nr_frags++, txp++, gop++) { + for (shinfo->nr_frags = 0; shinfo->nr_frags < nr_slots; ++txp) { + if (unlikely(!txp->size)) { + unsigned long flags; + + spin_lock_irqsave(&queue->response_lock, flags); + make_tx_response(queue, txp, 0, + XEN_NETIF_RSP_OKAY); + push_tx_responses(queue); + spin_unlock_irqrestore(&queue->response_lock, + flags); + continue; + } + index = pending_index(queue->pending_cons++); pending_idx = queue->pending_ring[index]; xenvif_tx_create_map_op(queue, pending_idx, txp, 0, gop); frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx); + ++shinfo->nr_frags; + ++gop; } - skb_shinfo(skb)->frag_list = nskb; - } else if (nskb) { + if (shinfo->nr_frags) { + skb_shinfo(skb)->frag_list = nskb; + nskb = NULL; + } + } + + if (nskb) { /* A frag_list skb was allocated but it is no longer needed - * because enough slots were converted to copy ops above. + * because enough slots were converted to copy ops above or some + * were empty. */ kfree_skb(nskb); } -- GitLab From 59b946ea30806064c4ac78f0ac93642655dd4f2e Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 8 Jan 2024 11:48:41 +0200 Subject: [PATCH 0710/1290] ASoC: Intel: bxt_da7219_max98357a: Fix kernel ops due to COMP_DUMMY change The change to avoid dummy components will leave the component name and dai_name NULL which will cause NULL dereference when trying to access to it in the machine driver when applying fixups. Link: https://github.com/thesofproject/linux/pull/4759#issuecomment-1878641868 Fixes: 13f58267cda3 ("ASoC: soc.h: don't create dummy Component via COMP_DUMMY()") Signed-off-by: Peter Ujfalusi Reviewed-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240108094842.28782-2-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/bxt_da7219_max98357a.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/soc/intel/boards/bxt_da7219_max98357a.c b/sound/soc/intel/boards/bxt_da7219_max98357a.c index 816fad8c1ff0e..540f7a29310a9 100644 --- a/sound/soc/intel/boards/bxt_da7219_max98357a.c +++ b/sound/soc/intel/boards/bxt_da7219_max98357a.c @@ -797,6 +797,9 @@ static int broxton_audio_probe(struct platform_device *pdev) broxton_audio_card.name = "glkda7219max"; /* Fixup the SSP entries for geminilake */ for (i = 0; i < ARRAY_SIZE(broxton_dais); i++) { + if (!broxton_dais[i].codecs->dai_name) + continue; + /* MAXIM_CODEC is connected to SSP1. */ if (!strcmp(broxton_dais[i].codecs->dai_name, BXT_MAXIM_CODEC_DAI)) { @@ -822,6 +825,9 @@ static int broxton_audio_probe(struct platform_device *pdev) broxton_audio_card.name = "cmlda7219max"; for (i = 0; i < ARRAY_SIZE(broxton_dais); i++) { + if (!broxton_dais[i].codecs->dai_name) + continue; + /* MAXIM_CODEC is connected to SSP1. */ if (!strcmp(broxton_dais[i].codecs->dai_name, BXT_MAXIM_CODEC_DAI)) { -- GitLab From 3ec71290db4de298b67659ef2bc2a8f84cf9537b Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 8 Jan 2024 11:48:42 +0200 Subject: [PATCH 0711/1290] ASoC: Intel: bxt_rt298: Fix kernel ops due to COMP_DUMMY change The change to avoid dummy components will leave the component name and dai_name NULL which will cause NULL dereference when trying to access to it in the machine driver when applying fixups. Link: https://github.com/thesofproject/linux/pull/4759#issuecomment-1878641868 Fixes: 13f58267cda3 ("ASoC: soc.h: don't create dummy Component via COMP_DUMMY()") Signed-off-by: Peter Ujfalusi Reviewed-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240108094842.28782-3-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/bxt_rt298.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/intel/boards/bxt_rt298.c b/sound/soc/intel/boards/bxt_rt298.c index 4631106f2a282..c0eb65c14aa97 100644 --- a/sound/soc/intel/boards/bxt_rt298.c +++ b/sound/soc/intel/boards/bxt_rt298.c @@ -604,7 +604,8 @@ static int broxton_audio_probe(struct platform_device *pdev) int i; for (i = 0; i < ARRAY_SIZE(broxton_rt298_dais); i++) { - if (!strncmp(card->dai_link[i].codecs->name, "i2c-INT343A:00", + if (card->dai_link[i].codecs->name && + !strncmp(card->dai_link[i].codecs->name, "i2c-INT343A:00", I2C_NAME_SIZE)) { if (!strncmp(card->name, "broxton-rt298", PLATFORM_NAME_SIZE)) { -- GitLab From 172fb49600c2b96a4ade255fbfc3fe7098b8afd3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 6 Dec 2023 10:48:30 -0800 Subject: [PATCH 0712/1290] nvme-pci: enhance timeout kernel log Kernel configs don't necessarily have opcode decoding, and some opcodes are not even decodable. It is still interesting for debugging SSD issues to know what opcode is timing out, what request type it came from, and the data size (if applicable). Also print the command_id along side blk-mq's tag to help match commands with protocol wire traces and firmware logs, Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f272026807419..75e763ce09aa0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1284,6 +1284,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) struct request *abort_req; struct nvme_command cmd = { }; u32 csts = readl(dev->bar + NVME_REG_CSTS); + u8 opcode; /* If PCI error recovery process is happening, we cannot reset or * the recovery mechanism will surely fail. @@ -1310,8 +1311,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) { dev_warn(dev->ctrl.device, - "I/O %d QID %d timeout, completion polled\n", - req->tag, nvmeq->qid); + "I/O tag %d (%04x) QID %d timeout, completion polled\n", + req->tag, nvme_cid(req), nvmeq->qid); return BLK_EH_DONE; } @@ -1327,8 +1328,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) fallthrough; case NVME_CTRL_DELETING: dev_warn_ratelimited(dev->ctrl.device, - "I/O %d QID %d timeout, disable controller\n", - req->tag, nvmeq->qid); + "I/O tag %d (%04x) QID %d timeout, disable controller\n", + req->tag, nvme_cid(req), nvmeq->qid); nvme_req(req)->flags |= NVME_REQ_CANCELLED; nvme_dev_disable(dev, true); return BLK_EH_DONE; @@ -1343,10 +1344,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) * command was already aborted once before and still hasn't been * returned to the driver, or if this is the admin queue. */ + opcode = nvme_req(req)->cmd->common.opcode; if (!nvmeq->qid || iod->aborted) { dev_warn(dev->ctrl.device, - "I/O %d QID %d timeout, reset controller\n", - req->tag, nvmeq->qid); + "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n", + req->tag, nvme_cid(req), opcode, + nvme_opcode_str(nvmeq->qid, opcode, 0), nvmeq->qid); nvme_req(req)->flags |= NVME_REQ_CANCELLED; goto disable; } @@ -1362,10 +1365,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) cmd.abort.sqid = cpu_to_le16(nvmeq->qid); dev_warn(nvmeq->dev->ctrl.device, - "I/O %d (%s) QID %d timeout, aborting\n", - req->tag, - nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode), - nvmeq->qid); + "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, aborting req_op:%s(%u) size:%u\n", + req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode), + nvmeq->qid, blk_op_str(req_op(req)), req_op(req), + blk_rq_bytes(req)); abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd), BLK_MQ_REQ_NOWAIT); -- GitLab From a5c1a87ce0876192d4343cdf5f0a22d737eb0ec3 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 7 Jan 2024 02:29:49 +0200 Subject: [PATCH 0713/1290] nvme-rdma: enhance timeout kernel log Print the command_id along side blk-mq's tag to help match commands with protocol wire traces and logs. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/rdma.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index bc90ec3c51b07..2e77c0f25f710 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1941,9 +1941,14 @@ static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq) struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_queue *queue = req->queue; struct nvme_rdma_ctrl *ctrl = queue->ctrl; - - dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n", - rq->tag, nvme_rdma_queue_idx(queue)); + u8 opcode = req->req.cmd->common.opcode; + u8 fctype = req->req.cmd->fabrics.fctype; + int qid = nvme_rdma_queue_idx(queue); + + dev_warn(ctrl->ctrl.device, + "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout\n", + rq->tag, nvme_cid(rq), opcode, + nvme_opcode_str(qid, opcode, fctype), qid); if (ctrl->ctrl.state != NVME_CTRL_LIVE) { /* -- GitLab From 45c36f04f1beb7c4c45f5910a1f69d6d9d5a19fb Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 7 Jan 2024 02:29:50 +0200 Subject: [PATCH 0714/1290] nvme-tcp: enhance timeout kernel log Print the command_id along side blk-mq's tag to help match commands with protocol wire traces and logs. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 5056bcae2f391..b234f0674aebb 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2425,9 +2425,9 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq) int qid = nvme_tcp_queue_id(req->queue); dev_warn(ctrl->device, - "queue %d: timeout cid %#x type %d opcode %#x (%s)\n", - nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type, - opc, nvme_opcode_str(qid, opc, fctype)); + "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n", + rq->tag, nvme_cid(rq), pdu->hdr.type, opc, + nvme_opcode_str(qid, opc, fctype), qid); if (ctrl->state != NVME_CTRL_LIVE) { /* -- GitLab From 9a1abc24850eb759e36a2f8869161c3b7254c904 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 5 Jan 2024 09:14:44 +0100 Subject: [PATCH 0715/1290] nvmet-tcp: Fix the H2C expected PDU len calculation The nvmet_tcp_handle_h2c_data_pdu() function should take into consideration the possibility that the header digest and/or the data digests are enabled when calculating the expected PDU length, before comparing it to the value stored in cmd->pdu_len. Fixes: efa56305908b ("nvmet-tcp: Fix a kernel panic when host sends an invalid H2C PDU length") Signed-off-by: Maurizio Lombardi Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 792828fb91ccf..4dc60cbcb205b 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -979,7 +979,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) { struct nvme_tcp_data_pdu *data = &queue->pdu.data; struct nvmet_tcp_cmd *cmd; - unsigned int plen; + unsigned int exp_data_len; if (likely(queue->nr_cmds)) { if (unlikely(data->ttag >= queue->nr_cmds)) { @@ -999,9 +999,13 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) goto err_proto; } - plen = le32_to_cpu(data->hdr.plen); + exp_data_len = le32_to_cpu(data->hdr.plen) - + nvmet_tcp_hdgst_len(queue) - + nvmet_tcp_ddgst_len(queue) - + sizeof(*data); + cmd->pdu_len = le32_to_cpu(data->data_length); - if (unlikely(cmd->pdu_len != (plen - sizeof(*data)) || + if (unlikely(cmd->pdu_len != exp_data_len || cmd->pdu_len == 0 || cmd->pdu_len > NVMET_TCP_MAXH2CDATA)) { pr_err("H2CData PDU len %u is invalid\n", cmd->pdu_len); -- GitLab From 4f1991a92cfe89096b2d1f5583a2e093bdd55c37 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 7 Jan 2024 20:32:58 -0500 Subject: [PATCH 0716/1290] tracing histograms: Simplify parse_actions() function The parse_actions() function uses 'len = str_has_prefix()' to test which action is in the string being parsed. But then it goes and repeats the logic for each different action. This logic can be simplified and duplicate code can be removed as 'len' contains the length of the found prefix which should be used for all actions. Link: https://lore.kernel.org/all/20240107112044.6702cb66@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20240107203258.37e26d2b@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Andy Shevchenko Cc: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 49 ++++++++++++++++---------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5ecf3c8bde205..6ece1308d36a0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4805,36 +4805,35 @@ static int parse_actions(struct hist_trigger_data *hist_data) int len; for (i = 0; i < hist_data->attrs->n_actions; i++) { + enum handler_id hid = 0; + char *action_str; + str = hist_data->attrs->action_str[i]; - if ((len = str_has_prefix(str, "onmatch("))) { - char *action_str = str + len; + if ((len = str_has_prefix(str, "onmatch("))) + hid = HANDLER_ONMATCH; + else if ((len = str_has_prefix(str, "onmax("))) + hid = HANDLER_ONMAX; + else if ((len = str_has_prefix(str, "onchange("))) + hid = HANDLER_ONCHANGE; - data = onmatch_parse(tr, action_str); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } - } else if ((len = str_has_prefix(str, "onmax("))) { - char *action_str = str + len; + action_str = str + len; - data = track_data_parse(hist_data, action_str, - HANDLER_ONMAX); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } - } else if ((len = str_has_prefix(str, "onchange("))) { - char *action_str = str + len; + switch (hid) { + case HANDLER_ONMATCH: + data = onmatch_parse(tr, action_str); + break; + case HANDLER_ONMAX: + case HANDLER_ONCHANGE: + data = track_data_parse(hist_data, action_str, hid); + break; + default: + data = ERR_PTR(-EINVAL); + break; + } - data = track_data_parse(hist_data, action_str, - HANDLER_ONCHANGE); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } - } else { - ret = -EINVAL; + if (IS_ERR(data)) { + ret = PTR_ERR(data); break; } -- GitLab From 3b7cb745473aec7255d66e3854abaa9c3f46f952 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Jan 2024 11:50:16 -0700 Subject: [PATCH 0717/1290] block: move __get_task_ioprio() into header file We call this once per IO, which can be millions of times per second. Since nobody really uses io priorities, or at least it isn't very common, this is all wasted time and can amount to as much as 3% of the total kernel time. Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/ioprio.c | 26 -------------------------- include/linux/ioprio.h | 25 ++++++++++++++++++++++++- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/block/ioprio.c b/block/ioprio.c index b5a942519a797..73301a261429f 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -139,32 +139,6 @@ out: return ret; } -/* - * If the task has set an I/O priority, use that. Otherwise, return - * the default I/O priority. - * - * Expected to be called for current task or with task_lock() held to keep - * io_context stable. - */ -int __get_task_ioprio(struct task_struct *p) -{ - struct io_context *ioc = p->io_context; - int prio; - - if (p != current) - lockdep_assert_held(&p->alloc_lock); - if (ioc) - prio = ioc->ioprio; - else - prio = IOPRIO_DEFAULT; - - if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) - prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), - task_nice_ioprio(p)); - return prio; -} -EXPORT_SYMBOL_GPL(__get_task_ioprio); - static int get_task_ioprio(struct task_struct *p) { int ret; diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 7578d4f6a969a..d6a9b5b7ed167 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -47,7 +47,30 @@ static inline int task_nice_ioclass(struct task_struct *task) } #ifdef CONFIG_BLOCK -int __get_task_ioprio(struct task_struct *p); +/* + * If the task has set an I/O priority, use that. Otherwise, return + * the default I/O priority. + * + * Expected to be called for current task or with task_lock() held to keep + * io_context stable. + */ +static inline int __get_task_ioprio(struct task_struct *p) +{ + struct io_context *ioc = p->io_context; + int prio; + + if (p != current) + lockdep_assert_held(&p->alloc_lock); + if (ioc) + prio = ioc->ioprio; + else + prio = IOPRIO_DEFAULT; + + if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) + prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), + task_nice_ioprio(p)); + return prio; +} #else static inline int __get_task_ioprio(struct task_struct *p) { -- GitLab From 53889bcaf536b3abedeaf104019877cee37dd08b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Jan 2024 11:51:57 -0700 Subject: [PATCH 0718/1290] block: make __get_task_ioprio() easier to read We don't need to do any gymnastics if we don't have an io_context assigned at all, so just return early with our default priority. Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/ioprio.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index d6a9b5b7ed167..db1249cd96920 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -59,13 +59,13 @@ static inline int __get_task_ioprio(struct task_struct *p) struct io_context *ioc = p->io_context; int prio; + if (!ioc) + return IOPRIO_DEFAULT; + if (p != current) lockdep_assert_held(&p->alloc_lock); - if (ioc) - prio = ioc->ioprio; - else - prio = IOPRIO_DEFAULT; + prio = ioc->ioprio; if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), task_nice_ioprio(p)); -- GitLab From d988c9f511af71a3445b6a4f3a2c67208ff8e480 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 8 Jan 2024 17:11:13 -0300 Subject: [PATCH 0719/1290] MAINTAINERS: Add Namhyung as tools/perf/ co-maintainer Acked-by: Ingo Molnar Acked-by: Linus Torvalds Acked-by: Namhyung Kim Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Mark Rutland Cc: Alexander Shishkin Link: https://lore.kernel.org/lkml/ZZxbCeVPnOjShbMQ@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 9104430e148e6..041ef848d7369 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16896,10 +16896,10 @@ PERFORMANCE EVENTS SUBSYSTEM M: Peter Zijlstra M: Ingo Molnar M: Arnaldo Carvalho de Melo +M: Namhyung Kim R: Mark Rutland R: Alexander Shishkin R: Jiri Olsa -R: Namhyung Kim R: Ian Rogers R: Adrian Hunter L: linux-perf-users@vger.kernel.org -- GitLab From 4d4e1b6319e5c4425ea3faeaf9a10b8b4c16c1e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Mon, 8 Jan 2024 17:44:58 -0300 Subject: [PATCH 0720/1290] ASoC: mediatek: mt8192: Check existence of dai_name before dereferencing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following commit 13f58267cda3 ("ASoC: soc.h: don't create dummy Component via COMP_DUMMY()"), the dai_name field is only populated for dummy components after the card is registered. This causes a null pointer dereference in the mt8192-mt6359 sound card driver's probe function when searching for a dai_name among all the card's dai links. Verify that the dai_name is non-null before passing it to strcmp. While at it, also check that there's at least one codec. Reported-by: kernelci.org bot Closes: https://linux.kernelci.org/test/case/id/6582cd6d992645c680e13478/ Fixes: 13f58267cda3 ("ASoC: soc.h: don't create dummy Component via COMP_DUMMY()") Signed-off-by: Nícolas F. R. A. Prado Link: https://msgid.link/r/20240108204508.691739-1-nfraprado@collabora.com Signed-off-by: Mark Brown --- sound/soc/mediatek/mt8192/mt8192-mt6359-rt1015-rt5682.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/mediatek/mt8192/mt8192-mt6359-rt1015-rt5682.c b/sound/soc/mediatek/mt8192/mt8192-mt6359-rt1015-rt5682.c index 5bd6addd14505..bfcb2c486c39d 100644 --- a/sound/soc/mediatek/mt8192/mt8192-mt6359-rt1015-rt5682.c +++ b/sound/soc/mediatek/mt8192/mt8192-mt6359-rt1015-rt5682.c @@ -1208,7 +1208,8 @@ static int mt8192_mt6359_dev_probe(struct platform_device *pdev) dai_link->ignore = 0; } - if (strcmp(dai_link->codecs[0].dai_name, RT1015_CODEC_DAI) == 0) + if (dai_link->num_codecs && dai_link->codecs[0].dai_name && + strcmp(dai_link->codecs[0].dai_name, RT1015_CODEC_DAI) == 0) dai_link->ops = &mt8192_rt1015_i2s_ops; if (!dai_link->platforms->name) -- GitLab From 8ead196be219adade3bd0d4115cc9b8506643121 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Fri, 5 Jan 2024 10:01:26 +0800 Subject: [PATCH 0721/1290] apparmor: Fix memory leak in unpack_profile() The aa_put_pdb(rules->file) should be called when rules->file is reassigned, otherwise there may be a memory leak. This was found via kmemleak: unreferenced object 0xffff986c17056600 (size 192): comm "apparmor_parser", pid 875, jiffies 4294893488 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 89 14 04 6c 98 ff ff ............l... 00 00 8c 11 6c 98 ff ff bc 0c 00 00 00 00 00 00 ....l........... backtrace (crc e28c80c4): [] kmemleak_alloc+0x4f/0x90 [] kmalloc_trace+0x2d2/0x340 [] aa_alloc_pdb+0x4d/0x90 [] unpack_pdb+0x48/0x660 [] unpack_profile+0x693/0x1090 [] aa_unpack+0x10a/0x6e0 [] aa_replace_profiles+0xa3/0x1210 [] policy_update+0x163/0x2a0 [] profile_replace+0xb1/0x130 [] vfs_write+0xd4/0x3d0 [] ksys_write+0x6b/0xf0 [] __x64_sys_write+0x1e/0x30 [] do_syscall_64+0x76/0x120 [] entry_SYSCALL_64_after_hwframe+0x6c/0x74 So add aa_put_pdb(rules->file) to fix it when rules->file is reassigned. Fixes: 98b824ff8984 ("apparmor: refcount the pdb") Signed-off-by: Gaosheng Cui Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index dbf7d96257ada..5e578ef0ddffb 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -1025,8 +1025,10 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } } else if (rules->policy->dfa && rules->policy->start[AA_CLASS_FILE]) { + aa_put_pdb(rules->file); rules->file = aa_get_pdb(rules->policy); } else { + aa_put_pdb(rules->file); rules->file = aa_get_pdb(nullpdb); } error = -EPROTO; -- GitLab From 3d1d4aa0cc13b1883a5a56c945837a2e0ecb5143 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 8 Jan 2024 10:02:55 +0000 Subject: [PATCH 0722/1290] cachefiles: Fix signed/unsigned mixup In __cachefiles_prepare_write(), the start and pos variables were made unsigned 64-bit so that the casts in the checking could be got rid of - which should be fine since absolute file offsets can't be negative, except that an error code may be obtained from vfs_llseek(), which *would* be negative. This breaks the error check. Fix this for now by reverting pos and start to be signed and putting back the casts. Unfortunately, the error value checks cannot be replaced with IS_ERR_VALUE() as long might be 32-bits. Fixes: 7097c96411d2 ("cachefiles: Fix __cachefiles_prepare_write()") Reported-by: Simon Horman Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401071152.DbKqMQMu-lkp@intel.com/ Signed-off-by: David Howells Reviewed-by: Simon Horman Reviewed-by: Gao Xiang cc: Yiqun Leng cc: Jia Zhu cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-erofs@lists.ozlabs.org cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- fs/cachefiles/io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 3eec269674375..9a2cb2868e901 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -522,7 +522,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, bool no_space_allocated_yet) { struct cachefiles_cache *cache = object->volume->cache; - unsigned long long start = *_start, pos; + loff_t start = *_start, pos; size_t len = *_len; int ret; @@ -556,7 +556,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, cachefiles_trace_seek_error); return pos; } - if (pos >= start + *_len) + if ((u64)pos >= (u64)start + *_len) goto check_space; /* Unallocated region */ /* We have a block that's at least partially filled - if we're low on @@ -575,7 +575,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, cachefiles_trace_seek_error); return pos; } - if (pos >= start + *_len) + if ((u64)pos >= (u64)start + *_len) return 0; /* Fully allocated */ /* Partially allocated, but insufficient space: cull. */ -- GitLab From e2bdb5272f4314256f51d91eee7babcae58b194b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 8 Jan 2024 21:30:49 +0000 Subject: [PATCH 0723/1290] netfs: Fix wrong #ifdef hiding wait netfs_writepages_begin() has the wait on the fscache folio conditional on CONFIG_NETFS_FSCACHE - which doesn't exist. Fix it to be conditional on CONFIG_FSCACHE instead. Fixes: 62c3b7481b9a ("netfs: Provide a writepages implementation") Reported-by: Lukas Bulwahn Signed-off-by: David Howells cc: Matthew Wilcox cc: linux-afs@lists.infradead.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/20240109083257.GK132648@kernel.org/ --- fs/netfs/buffered_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 0b2b7a60dabc0..de517ca70d916 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -1076,7 +1076,7 @@ lock_again: folio_unlock(folio); if (wbc->sync_mode != WB_SYNC_NONE) { folio_wait_writeback(folio); -#ifdef CONFIG_NETFS_FSCACHE +#ifdef CONFIG_FSCACHE folio_wait_fscache(folio); #endif goto lock_again; -- GitLab From 3dc2f209208dddc73ffd1d817081711343de3242 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Tue, 9 Jan 2024 10:45:47 +0800 Subject: [PATCH 0724/1290] swiotlb: check alloc_size before the allocation of a new memory pool The allocation request for swiotlb contiguous memory greater than 128*2KB cannot be fulfilled because it exceeds the maximum contiguous memory limit. If the swiotlb memory we allocate is larger than 128*2KB, swiotlb_find_slots() will still schedule the allocation of a new memory pool, which will increase memory overhead. Fix it by adding a check with alloc_size no more than 128*2KB before scheduling the allocation of a new memory pool in swiotlb_find_slots(). Signed-off-by: ZhangPeng Reviewed-by: Petr Tesarik Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index e20b856255ef6..96f832d828fd4 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1136,6 +1136,9 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, int cpu, i; int index; + if (alloc_size > IO_TLB_SEGSIZE * IO_TLB_SIZE) + return -1; + cpu = raw_smp_processor_id(); for (i = 0; i < default_nareas; ++i) { index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size, -- GitLab From f9cfe7e7f96a9414a17d596e288693c4f2325d49 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 9 Jan 2024 21:39:57 +0800 Subject: [PATCH 0725/1290] md: Fix md_seq_ops() regressions Commit cf1b6d4441ff ("md: simplify md_seq_ops") introduce following regressions: 1) If list all_mddevs is emptly, personalities and unused devices won't be showed to user anymore. 2) If seq_file buffer overflowed from md_seq_show(), then md_seq_start() will be called again, hence personalities will be showed to user again. 3) If seq_file buffer overflowed from md_seq_stop(), seq_read_iter() doesn't handle this, hence unused devices won't be showed to user. Fix above problems by printing personalities and unused devices in md_seq_show(). Fixes: cf1b6d4441ff ("md: simplify md_seq_ops") Cc: stable@vger.kernel.org # v6.7+ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240109133957.2975272-1-yukuai1@huaweicloud.com --- drivers/md/md.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e351e6c51cc72..ff3057c787c1b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8135,6 +8135,19 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "\n"); } +static void status_personalities(struct seq_file *seq) +{ + struct md_personality *pers; + + seq_puts(seq, "Personalities : "); + spin_lock(&pers_lock); + list_for_each_entry(pers, &pers_list, list) + seq_printf(seq, "[%s] ", pers->name); + + spin_unlock(&pers_lock); + seq_puts(seq, "\n"); +} + static int status_resync(struct seq_file *seq, struct mddev *mddev) { sector_t max_sectors, resync, res; @@ -8276,20 +8289,10 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) static void *md_seq_start(struct seq_file *seq, loff_t *pos) __acquires(&all_mddevs_lock) { - struct md_personality *pers; - - seq_puts(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - - spin_unlock(&pers_lock); - seq_puts(seq, "\n"); seq->poll_event = atomic_read(&md_event_count); - spin_lock(&all_mddevs_lock); - return seq_list_start(&all_mddevs, *pos); + return seq_list_start_head(&all_mddevs, *pos); } static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -8300,16 +8303,23 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void md_seq_stop(struct seq_file *seq, void *v) __releases(&all_mddevs_lock) { - status_unused(seq); spin_unlock(&all_mddevs_lock); } static int md_seq_show(struct seq_file *seq, void *v) { - struct mddev *mddev = list_entry(v, struct mddev, all_mddevs); + struct mddev *mddev; sector_t sectors; struct md_rdev *rdev; + if (v == &all_mddevs) { + status_personalities(seq); + if (list_empty(&all_mddevs)) + status_unused(seq); + return 0; + } + + mddev = list_entry(v, struct mddev, all_mddevs); if (!mddev_get(mddev)) return 0; @@ -8385,6 +8395,10 @@ static int md_seq_show(struct seq_file *seq, void *v) } spin_unlock(&mddev->lock); spin_lock(&all_mddevs_lock); + + if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) + status_unused(seq); + if (atomic_dec_and_test(&mddev->active)) __mddev_put(mddev); -- GitLab From 47bf0f83fc86df1bf42b385a91aadb910137c5c9 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 2 Jan 2024 15:07:44 -0500 Subject: [PATCH 0726/1290] drm/amdkfd: Fix lock dependency warning ====================================================== WARNING: possible circular locking dependency detected 6.5.0-kfd-fkuehlin #276 Not tainted ------------------------------------------------------ kworker/8:2/2676 is trying to acquire lock: ffff9435aae95c88 ((work_completion)(&svm_bo->eviction_work)){+.+.}-{0:0}, at: __flush_work+0x52/0x550 but task is already holding lock: ffff9435cd8e1720 (&svms->lock){+.+.}-{3:3}, at: svm_range_deferred_list_work+0xe8/0x340 [amdgpu] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&svms->lock){+.+.}-{3:3}: __mutex_lock+0x97/0xd30 kfd_ioctl_alloc_memory_of_gpu+0x6d/0x3c0 [amdgpu] kfd_ioctl+0x1b2/0x5d0 [amdgpu] __x64_sys_ioctl+0x86/0xc0 do_syscall_64+0x39/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #1 (&mm->mmap_lock){++++}-{3:3}: down_read+0x42/0x160 svm_range_evict_svm_bo_worker+0x8b/0x340 [amdgpu] process_one_work+0x27a/0x540 worker_thread+0x53/0x3e0 kthread+0xeb/0x120 ret_from_fork+0x31/0x50 ret_from_fork_asm+0x11/0x20 -> #0 ((work_completion)(&svm_bo->eviction_work)){+.+.}-{0:0}: __lock_acquire+0x1426/0x2200 lock_acquire+0xc1/0x2b0 __flush_work+0x80/0x550 __cancel_work_timer+0x109/0x190 svm_range_bo_release+0xdc/0x1c0 [amdgpu] svm_range_free+0x175/0x180 [amdgpu] svm_range_deferred_list_work+0x15d/0x340 [amdgpu] process_one_work+0x27a/0x540 worker_thread+0x53/0x3e0 kthread+0xeb/0x120 ret_from_fork+0x31/0x50 ret_from_fork_asm+0x11/0x20 other info that might help us debug this: Chain exists of: (work_completion)(&svm_bo->eviction_work) --> &mm->mmap_lock --> &svms->lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&svms->lock); lock(&mm->mmap_lock); lock(&svms->lock); lock((work_completion)(&svm_bo->eviction_work)); I believe this cannot really lead to a deadlock in practice, because svm_range_evict_svm_bo_worker only takes the mmap_read_lock if the BO refcount is non-0. That means it's impossible that svm_range_bo_release is running concurrently. However, there is no good way to annotate this. To avoid the problem, take a BO reference in svm_range_schedule_evict_svm_bo instead of in the worker. That way it's impossible for a BO to get freed while eviction work is pending and the cancel_work_sync call in svm_range_bo_release can be eliminated. v2: Use svm_bo_ref_unless_zero and explained why that's safe. Also removed redundant checks that are already done in amdkfd_fence_enable_signaling. Signed-off-by: Felix Kuehling Reviewed-by: Philip Yang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index ac84c4a2ca072..d46c145835e09 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -404,14 +404,9 @@ static void svm_range_bo_release(struct kref *kref) spin_lock(&svm_bo->list_lock); } spin_unlock(&svm_bo->list_lock); - if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) { - /* We're not in the eviction worker. - * Signal the fence and synchronize with any - * pending eviction work. - */ + if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) + /* We're not in the eviction worker. Signal the fence. */ dma_fence_signal(&svm_bo->eviction_fence->base); - cancel_work_sync(&svm_bo->eviction_work); - } dma_fence_put(&svm_bo->eviction_fence->base); amdgpu_bo_unref(&svm_bo->bo); kfree(svm_bo); @@ -3432,13 +3427,14 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) { - if (!fence) - return -EINVAL; - - if (dma_fence_is_signaled(&fence->base)) - return 0; - - if (fence->svm_bo) { + /* Dereferencing fence->svm_bo is safe here because the fence hasn't + * signaled yet and we're under the protection of the fence->lock. + * After the fence is signaled in svm_range_bo_release, we cannot get + * here any more. + * + * Reference is dropped in svm_range_evict_svm_bo_worker. + */ + if (svm_bo_ref_unless_zero(fence->svm_bo)) { WRITE_ONCE(fence->svm_bo->evicting, 1); schedule_work(&fence->svm_bo->eviction_work); } @@ -3453,8 +3449,6 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work) int r = 0; svm_bo = container_of(work, struct svm_range_bo, eviction_work); - if (!svm_bo_ref_unless_zero(svm_bo)) - return; /* svm_bo was freed while eviction was pending */ if (mmget_not_zero(svm_bo->eviction_fence->mm)) { mm = svm_bo->eviction_fence->mm; -- GitLab From 17e74e11ac2b46e7514705ae7abfb93ac0e20bd6 Mon Sep 17 00:00:00 2001 From: Martin Tsai Date: Mon, 18 Dec 2023 16:36:44 +0800 Subject: [PATCH 0727/1290] drm/amd/display: To adjust dprefclk by down spread percentage [Why] Panels show corruption with high refresh rate timings when ssc is enabled. [How] Read down-spread percentage from lut to adjust dprefclk. Issues come from S0i3 with this commit has been fixed by SMU. Reviewed-by: Nicholas Kazlauskas Acked-by: Rodrigo Siqueira Signed-off-by: Martin Tsai Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../dc/clk_mgr/dcn314/dcn314_clk_mgr.c | 71 ++++++++++++++++++- .../dc/clk_mgr/dcn314/dcn314_clk_mgr.h | 11 +++ .../gpu/drm/amd/display/dc/dce/dce_audio.c | 2 +- .../drm/amd/display/dc/dce/dce_clock_source.c | 9 ++- .../amd/display/dc/hwss/dce110/dce110_hwseq.c | 2 +- .../gpu/drm/amd/display/dc/inc/hw/clk_mgr.h | 1 + .../gpu/drm/amd/display/include/audio_types.h | 2 +- 7 files changed, 93 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c index 7575282563267..878c0e7b78abd 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c @@ -87,6 +87,20 @@ static const struct IP_BASE CLK_BASE = { { { { 0x00016C00, 0x02401800, 0, 0, 0, #define CLK1_CLK_PLL_REQ__PllSpineDiv_MASK 0x0000F000L #define CLK1_CLK_PLL_REQ__FbMult_frac_MASK 0xFFFF0000L +#define regCLK1_CLK2_BYPASS_CNTL 0x029c +#define regCLK1_CLK2_BYPASS_CNTL_BASE_IDX 0 + +#define CLK1_CLK2_BYPASS_CNTL__CLK2_BYPASS_SEL__SHIFT 0x0 +#define CLK1_CLK2_BYPASS_CNTL__CLK2_BYPASS_DIV__SHIFT 0x10 +#define CLK1_CLK2_BYPASS_CNTL__CLK2_BYPASS_SEL_MASK 0x00000007L +#define CLK1_CLK2_BYPASS_CNTL__CLK2_BYPASS_DIV_MASK 0x000F0000L + +#define regCLK6_0_CLK6_spll_field_8 0x464b +#define regCLK6_0_CLK6_spll_field_8_BASE_IDX 0 + +#define CLK6_0_CLK6_spll_field_8__spll_ssc_en__SHIFT 0xd +#define CLK6_0_CLK6_spll_field_8__spll_ssc_en_MASK 0x00002000L + #define REG(reg_name) \ (CLK_BASE.instance[0].segment[reg ## reg_name ## _BASE_IDX] + reg ## reg_name) @@ -160,6 +174,37 @@ static void dcn314_disable_otg_wa(struct clk_mgr *clk_mgr_base, struct dc_state } } +bool dcn314_is_spll_ssc_enabled(struct clk_mgr *clk_mgr_base) +{ + struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base); + uint32_t ssc_enable; + + REG_GET(CLK6_0_CLK6_spll_field_8, spll_ssc_en, &ssc_enable); + + return ssc_enable == 1; +} + +void dcn314_init_clocks(struct clk_mgr *clk_mgr) +{ + struct clk_mgr_internal *clk_mgr_int = TO_CLK_MGR_INTERNAL(clk_mgr); + uint32_t ref_dtbclk = clk_mgr->clks.ref_dtbclk_khz; + + memset(&(clk_mgr->clks), 0, sizeof(struct dc_clocks)); + // Assumption is that boot state always supports pstate + clk_mgr->clks.ref_dtbclk_khz = ref_dtbclk; // restore ref_dtbclk + clk_mgr->clks.p_state_change_support = true; + clk_mgr->clks.prev_p_state_change_support = true; + clk_mgr->clks.pwr_state = DCN_PWR_STATE_UNKNOWN; + clk_mgr->clks.zstate_support = DCN_ZSTATE_SUPPORT_UNKNOWN; + + // to adjust dp_dto reference clock if ssc is enable otherwise to apply dprefclk + if (dcn314_is_spll_ssc_enabled(clk_mgr)) + clk_mgr->dp_dto_source_clock_in_khz = + dce_adjust_dp_ref_freq_for_ss(clk_mgr_int, clk_mgr->dprefclk_khz); + else + clk_mgr->dp_dto_source_clock_in_khz = clk_mgr->dprefclk_khz; +} + void dcn314_update_clocks(struct clk_mgr *clk_mgr_base, struct dc_state *context, bool safe_to_lower) @@ -436,6 +481,11 @@ static DpmClocks314_t dummy_clocks; static struct dcn314_watermarks dummy_wms = { 0 }; +static struct dcn314_ss_info_table ss_info_table = { + .ss_divider = 1000, + .ss_percentage = {0, 0, 375, 375, 375} +}; + static void dcn314_build_watermark_ranges(struct clk_bw_params *bw_params, struct dcn314_watermarks *table) { int i, num_valid_sets; @@ -708,13 +758,31 @@ static struct clk_mgr_funcs dcn314_funcs = { .get_dp_ref_clk_frequency = dce12_get_dp_ref_freq_khz, .get_dtb_ref_clk_frequency = dcn31_get_dtb_ref_freq_khz, .update_clocks = dcn314_update_clocks, - .init_clocks = dcn31_init_clocks, + .init_clocks = dcn314_init_clocks, .enable_pme_wa = dcn314_enable_pme_wa, .are_clock_states_equal = dcn314_are_clock_states_equal, .notify_wm_ranges = dcn314_notify_wm_ranges }; extern struct clk_mgr_funcs dcn3_fpga_funcs; +static void dcn314_read_ss_info_from_lut(struct clk_mgr_internal *clk_mgr) +{ + uint32_t clock_source; + //uint32_t ssc_enable; + + REG_GET(CLK1_CLK2_BYPASS_CNTL, CLK2_BYPASS_SEL, &clock_source); + //REG_GET(CLK6_0_CLK6_spll_field_8, spll_ssc_en, &ssc_enable); + + if (dcn314_is_spll_ssc_enabled(&clk_mgr->base) && (clock_source < ARRAY_SIZE(ss_info_table.ss_percentage))) { + clk_mgr->dprefclk_ss_percentage = ss_info_table.ss_percentage[clock_source]; + + if (clk_mgr->dprefclk_ss_percentage != 0) { + clk_mgr->ss_on_dprefclk = true; + clk_mgr->dprefclk_ss_divider = ss_info_table.ss_divider; + } + } +} + void dcn314_clk_mgr_construct( struct dc_context *ctx, struct clk_mgr_dcn314 *clk_mgr, @@ -782,6 +850,7 @@ void dcn314_clk_mgr_construct( clk_mgr->base.base.dprefclk_khz = 600000; clk_mgr->base.base.clks.ref_dtbclk_khz = 600000; dce_clock_read_ss_info(&clk_mgr->base); + dcn314_read_ss_info_from_lut(&clk_mgr->base); /*if bios enabled SS, driver needs to adjust dtb clock, only enable with correct bios*/ clk_mgr->base.base.bw_params = &dcn314_bw_params; diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.h b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.h index 171f84340eb2f..002c28e807208 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.h +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.h @@ -28,6 +28,8 @@ #define __DCN314_CLK_MGR_H__ #include "clk_mgr_internal.h" +#define DCN314_NUM_CLOCK_SOURCES 5 + struct dcn314_watermarks; struct dcn314_smu_watermark_set { @@ -40,9 +42,18 @@ struct clk_mgr_dcn314 { struct dcn314_smu_watermark_set smu_wm_set; }; +struct dcn314_ss_info_table { + uint32_t ss_divider; + uint32_t ss_percentage[DCN314_NUM_CLOCK_SOURCES]; +}; + bool dcn314_are_clock_states_equal(struct dc_clocks *a, struct dc_clocks *b); +bool dcn314_is_spll_ssc_enabled(struct clk_mgr *clk_mgr_base); + +void dcn314_init_clocks(struct clk_mgr *clk_mgr); + void dcn314_update_clocks(struct clk_mgr *clk_mgr_base, struct dc_state *context, bool safe_to_lower); diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_audio.c b/drivers/gpu/drm/amd/display/dc/dce/dce_audio.c index 140598f18bbdd..f0458b8f00af8 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dce_audio.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dce_audio.c @@ -782,7 +782,7 @@ static void get_azalia_clock_info_dp( /*audio_dto_module = dpDtoSourceClockInkhz * 10,000; * [khz] ->[100Hz] */ azalia_clock_info->audio_dto_module = - pll_info->dp_dto_source_clock_in_khz * 10; + pll_info->audio_dto_source_clock_in_khz * 10; } void dce_aud_wall_dto_setup( diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c b/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c index 5d3f6fa1011e8..970644b695cd4 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dce_clock_source.c @@ -975,6 +975,9 @@ static bool dcn31_program_pix_clk( look_up_in_video_optimized_rate_tlb(pix_clk_params->requested_pix_clk_100hz / 10); struct bp_pixel_clock_parameters bp_pc_params = {0}; enum transmitter_color_depth bp_pc_colour_depth = TRANSMITTER_COLOR_DEPTH_24; + + if (clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz != 0) + dp_dto_ref_khz = clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz; // For these signal types Driver to program DP_DTO without calling VBIOS Command table if (dc_is_dp_signal(pix_clk_params->signal_type) || dc_is_virtual_signal(pix_clk_params->signal_type)) { if (e) { @@ -1088,6 +1091,10 @@ static bool get_pixel_clk_frequency_100hz( struct dce110_clk_src *clk_src = TO_DCE110_CLK_SRC(clock_source); unsigned int clock_hz = 0; unsigned int modulo_hz = 0; + unsigned int dp_dto_ref_khz = clock_source->ctx->dc->clk_mgr->dprefclk_khz; + + if (clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz != 0) + dp_dto_ref_khz = clock_source->ctx->dc->clk_mgr->dp_dto_source_clock_in_khz; if (clock_source->id == CLOCK_SOURCE_ID_DP_DTO) { clock_hz = REG_READ(PHASE[inst]); @@ -1100,7 +1107,7 @@ static bool get_pixel_clk_frequency_100hz( modulo_hz = REG_READ(MODULO[inst]); if (modulo_hz) *pixel_clk_khz = div_u64((uint64_t)clock_hz* - clock_source->ctx->dc->clk_mgr->dprefclk_khz*10, + dp_dto_ref_khz*10, modulo_hz); else *pixel_clk_khz = 0; diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c index fb328cd06cea2..5660f15da291e 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c @@ -1354,7 +1354,7 @@ static void build_audio_output( if (state->clk_mgr && (pipe_ctx->stream->signal == SIGNAL_TYPE_DISPLAY_PORT || pipe_ctx->stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST)) { - audio_output->pll_info.dp_dto_source_clock_in_khz = + audio_output->pll_info.audio_dto_source_clock_in_khz = state->clk_mgr->funcs->get_dp_ref_clk_frequency( state->clk_mgr); } diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/clk_mgr.h b/drivers/gpu/drm/amd/display/dc/inc/hw/clk_mgr.h index cbba39d251e53..17e014d3bdc84 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/clk_mgr.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/clk_mgr.h @@ -333,6 +333,7 @@ struct clk_mgr { bool force_smu_not_present; bool dc_mode_softmax_enabled; int dprefclk_khz; // Used by program pixel clock in clock source funcs, need to figureout where this goes + int dp_dto_source_clock_in_khz; // Used to program DP DTO with ss adjustment on DCN314 int dentist_vco_freq_khz; struct clk_state_registers_and_bypass boot_snapshot; struct clk_bw_params *bw_params; diff --git a/drivers/gpu/drm/amd/display/include/audio_types.h b/drivers/gpu/drm/amd/display/include/audio_types.h index 66a54da0641ce..915a031a43cb2 100644 --- a/drivers/gpu/drm/amd/display/include/audio_types.h +++ b/drivers/gpu/drm/amd/display/include/audio_types.h @@ -64,7 +64,7 @@ enum audio_dto_source { /* PLL information required for AZALIA DTO calculation */ struct audio_pll_info { - uint32_t dp_dto_source_clock_in_khz; + uint32_t audio_dto_source_clock_in_khz; uint32_t feed_back_divider; enum audio_dto_source dto_source; bool ss_enabled; -- GitLab From 7bdbfb4e36e34eb788e44f27666bf0a2b3b90803 Mon Sep 17 00:00:00 2001 From: George Shen Date: Sun, 17 Dec 2023 17:17:57 -0500 Subject: [PATCH 0728/1290] drm/amd/display: Disconnect phantom pipe OPP from OPTC being disabled [Why] If an OPP is used for a different OPTC without first being disconnected from the previous OPTC, unexpected behaviour can occur. This also applies to phantom pipes, which is what the current logic missed. [How] Disconnect OPPs from OPTC for phantom pipes before disabling OTG master. Also move the disconnection to before the OTG master disable, since the register is double buffered. Reviewed-by: Dillon Varone Acked-by: Rodrigo Siqueira Signed-off-by: George Shen Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/dc/optc/dcn32/dcn32_optc.c | 19 +++++++++++++------ .../amd/display/dc/optc/dcn35/dcn35_optc.c | 12 ++++++------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c index 91ea0d4da06a9..1788eb29474b4 100644 --- a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c +++ b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c @@ -166,12 +166,6 @@ static bool optc32_disable_crtc(struct timing_generator *optc) { struct optc *optc1 = DCN10TG_FROM_TG(optc); - /* disable otg request until end of the first line - * in the vertical blank region - */ - REG_UPDATE(OTG_CONTROL, - OTG_MASTER_EN, 0); - REG_UPDATE_5(OPTC_DATA_SOURCE_SELECT, OPTC_SEG0_SRC_SEL, 0xf, OPTC_SEG1_SRC_SEL, 0xf, @@ -179,6 +173,12 @@ static bool optc32_disable_crtc(struct timing_generator *optc) OPTC_SEG3_SRC_SEL, 0xf, OPTC_NUM_OF_INPUT_SEGMENT, 0); + /* disable otg request until end of the first line + * in the vertical blank region + */ + REG_UPDATE(OTG_CONTROL, + OTG_MASTER_EN, 0); + REG_UPDATE(CONTROL, VTG0_ENABLE, 0); @@ -205,6 +205,13 @@ static void optc32_disable_phantom_otg(struct timing_generator *optc) { struct optc *optc1 = DCN10TG_FROM_TG(optc); + REG_UPDATE_5(OPTC_DATA_SOURCE_SELECT, + OPTC_SEG0_SRC_SEL, 0xf, + OPTC_SEG1_SRC_SEL, 0xf, + OPTC_SEG2_SRC_SEL, 0xf, + OPTC_SEG3_SRC_SEL, 0xf, + OPTC_NUM_OF_INPUT_SEGMENT, 0); + REG_UPDATE(OTG_CONTROL, OTG_MASTER_EN, 0); } diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn35/dcn35_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn35/dcn35_optc.c index 08a59cf449cae..3d6c1b2c2b4d6 100644 --- a/drivers/gpu/drm/amd/display/dc/optc/dcn35/dcn35_optc.c +++ b/drivers/gpu/drm/amd/display/dc/optc/dcn35/dcn35_optc.c @@ -138,12 +138,6 @@ static bool optc35_disable_crtc(struct timing_generator *optc) { struct optc *optc1 = DCN10TG_FROM_TG(optc); - /* disable otg request until end of the first line - * in the vertical blank region - */ - REG_UPDATE(OTG_CONTROL, - OTG_MASTER_EN, 0); - REG_UPDATE_5(OPTC_DATA_SOURCE_SELECT, OPTC_SEG0_SRC_SEL, 0xf, OPTC_SEG1_SRC_SEL, 0xf, @@ -151,6 +145,12 @@ static bool optc35_disable_crtc(struct timing_generator *optc) OPTC_SEG3_SRC_SEL, 0xf, OPTC_NUM_OF_INPUT_SEGMENT, 0); + /* disable otg request until end of the first line + * in the vertical blank region + */ + REG_UPDATE(OTG_CONTROL, + OTG_MASTER_EN, 0); + REG_UPDATE(CONTROL, VTG0_ENABLE, 0); -- GitLab From 51c7e6ac24101af3147ebc45627810da367c6b66 Mon Sep 17 00:00:00 2001 From: Martin Leung Date: Wed, 20 Dec 2023 16:54:05 -0500 Subject: [PATCH 0729/1290] drm/amd/display: revert "for FPO & SubVP/DRR config program vmin/max" This reverts commit 6b2b782ad6a25734ae847d1659bea3f613dbb563. The original commit causes issues with certain features when DRR is disabled, need to revisit this change later after resolving issues with new DRR policy. Reviewed-by: Rodrigo Siqueira Signed-off-by: Martin Leung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/core/dc_resource.c | 14 ------- .../display/dc/dcn32/dcn32_resource_helpers.c | 14 +++++++ .../drm/amd/display/dc/dml/dcn32/dcn32_fpu.c | 11 +++--- .../amd/display/dc/hwss/dcn20/dcn20_hwseq.c | 37 ------------------- drivers/gpu/drm/amd/display/dc/inc/resource.h | 3 -- .../dc/resource/dcn32/dcn32_resource.c | 2 +- .../dc/resource/dcn32/dcn32_resource.h | 3 ++ .../dc/resource/dcn321/dcn321_resource.c | 2 +- 8 files changed, 24 insertions(+), 62 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index 57f0ddd159239..f2abc1096ffb6 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -4986,20 +4986,6 @@ enum dc_status update_dp_encoder_resources_for_test_harness(const struct dc *dc, return DC_OK; } -bool resource_subvp_in_use(struct dc *dc, - struct dc_state *context) -{ - uint32_t i; - - for (i = 0; i < dc->res_pool->pipe_count; i++) { - struct pipe_ctx *pipe = &context->res_ctx.pipe_ctx[i]; - - if (dc_state_get_pipe_subvp_type(context, pipe) != SUBVP_NONE) - return true; - } - return false; -} - bool check_subvp_sw_cursor_fallback_req(const struct dc *dc, struct dc_stream_state *stream) { if (!dc->debug.disable_subvp_high_refresh && is_subvp_high_refresh_candidate(stream)) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c index e4a328b45c8a5..87760600e154d 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c @@ -183,6 +183,20 @@ bool dcn32_all_pipes_have_stream_and_plane(struct dc *dc, return true; } +bool dcn32_subvp_in_use(struct dc *dc, + struct dc_state *context) +{ + uint32_t i; + + for (i = 0; i < dc->res_pool->pipe_count; i++) { + struct pipe_ctx *pipe = &context->res_ctx.pipe_ctx[i]; + + if (dc_state_get_pipe_subvp_type(context, pipe) != SUBVP_NONE) + return true; + } + return false; +} + bool dcn32_mpo_in_use(struct dc_state *context) { uint32_t i; diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c index aa68d010cbfd2..9f37f717a1f86 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c @@ -33,7 +33,6 @@ #include "dcn30/dcn30_resource.h" #include "link.h" #include "dc_state_priv.h" -#include "resource.h" #define DC_LOGGER_INIT(logger) @@ -292,7 +291,7 @@ int dcn32_find_dummy_latency_index_for_fw_based_mclk_switch(struct dc *dc, /* for subvp + DRR case, if subvp pipes are still present we support pstate */ if (vba->DRAMClockChangeSupport[vlevel][vba->maxMpcComb] == dm_dram_clock_change_unsupported && - resource_subvp_in_use(dc, context)) + dcn32_subvp_in_use(dc, context)) vba->DRAMClockChangeSupport[vlevel][context->bw_ctx.dml.vba.maxMpcComb] = temp_clock_change_support; if (vlevel < context->bw_ctx.dml.vba.soc.num_states && @@ -2273,7 +2272,7 @@ void dcn32_calculate_wm_and_dlg_fpu(struct dc *dc, struct dc_state *context, unsigned int dummy_latency_index = 0; int maxMpcComb = context->bw_ctx.dml.vba.maxMpcComb; unsigned int min_dram_speed_mts = context->bw_ctx.dml.vba.DRAMSpeed; - bool subvp_active = resource_subvp_in_use(dc, context); + bool subvp_in_use = dcn32_subvp_in_use(dc, context); unsigned int min_dram_speed_mts_margin; bool need_fclk_lat_as_dummy = false; bool is_subvp_p_drr = false; @@ -2282,7 +2281,7 @@ void dcn32_calculate_wm_and_dlg_fpu(struct dc *dc, struct dc_state *context, dc_assert_fp_enabled(); /* need to find dummy latency index for subvp */ - if (subvp_active) { + if (subvp_in_use) { /* Override DRAMClockChangeSupport for SubVP + DRR case where the DRR cannot switch without stretching it's VBLANK */ if (!pstate_en) { context->bw_ctx.dml.vba.DRAMClockChangeSupport[vlevel][maxMpcComb] = dm_dram_clock_change_vblank_w_mall_sub_vp; @@ -2468,7 +2467,7 @@ void dcn32_calculate_wm_and_dlg_fpu(struct dc *dc, struct dc_state *context, dc->clk_mgr->bw_params->clk_table.entries[min_dram_speed_mts_offset].memclk_mhz * 16; } - if (!context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching && !subvp_active) { + if (!context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching && !subvp_in_use) { /* find largest table entry that is lower than dram speed, * but lower than DPM0 still uses DPM0 */ @@ -3528,7 +3527,7 @@ void dcn32_set_clock_limits(const struct _vcs_dpi_soc_bounding_box_st *soc_bb) void dcn32_override_min_req_memclk(struct dc *dc, struct dc_state *context) { // WA: restrict FPO and SubVP to use first non-strobe mode (DCN32 BW issue) - if ((context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching || resource_subvp_in_use(dc, context)) && + if ((context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching || dcn32_subvp_in_use(dc, context)) && dc->dml.soc.num_chans <= 8) { int num_mclk_levels = dc->clk_mgr->bw_params->clk_table.num_entries_per_clk.num_memclk_levels; diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c index bc71a9b058fed..0dfcb3cdcd20c 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c @@ -1882,42 +1882,6 @@ static void dcn20_program_pipe( } } -static void update_vmin_vmax_fams(struct dc *dc, - struct dc_state *context) -{ - uint32_t i; - struct drr_params params = {0}; - bool subvp_in_use = resource_subvp_in_use(dc, context); - - for (i = 0; i < dc->res_pool->pipe_count; i++) { - struct pipe_ctx *pipe = &context->res_ctx.pipe_ctx[i]; - - if (resource_is_pipe_type(pipe, OTG_MASTER) && - ((subvp_in_use && dc_state_get_pipe_subvp_type(context, pipe) != SUBVP_PHANTOM && - pipe->stream->allow_freesync) || (context->bw_ctx.bw.dcn.clk.fw_based_mclk_switching && pipe->stream->fpo_in_use))) { - if (!pipe->stream->vrr_active_variable && !pipe->stream->vrr_active_fixed) { - struct timing_generator *tg = context->res_ctx.pipe_ctx[i].stream_res.tg; - - /* DRR should be configured already if we're in active variable - * or active fixed, so only program if we're not in this state - */ - params.vertical_total_min = pipe->stream->timing.v_total; - params.vertical_total_max = pipe->stream->timing.v_total; - tg->funcs->set_drr(tg, ¶ms); - } - } else { - if (resource_is_pipe_type(pipe, OTG_MASTER) && - !pipe->stream->vrr_active_variable && - !pipe->stream->vrr_active_fixed) { - struct timing_generator *tg = context->res_ctx.pipe_ctx[i].stream_res.tg; - params.vertical_total_min = 0; - params.vertical_total_max = 0; - tg->funcs->set_drr(tg, ¶ms); - } - } - } -} - void dcn20_program_front_end_for_ctx( struct dc *dc, struct dc_state *context) @@ -1994,7 +1958,6 @@ void dcn20_program_front_end_for_ctx( && context->res_ctx.pipe_ctx[i].stream) hws->funcs.blank_pixel_data(dc, &context->res_ctx.pipe_ctx[i], true); - update_vmin_vmax_fams(dc, context); /* Disconnect mpcc */ for (i = 0; i < dc->res_pool->pipe_count; i++) diff --git a/drivers/gpu/drm/amd/display/dc/inc/resource.h b/drivers/gpu/drm/amd/display/dc/inc/resource.h index 1d51fed12e200..c958ef37b78a6 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/resource.h +++ b/drivers/gpu/drm/amd/display/dc/inc/resource.h @@ -609,9 +609,6 @@ bool dc_resource_acquire_secondary_pipe_for_mpc_odm_legacy( struct pipe_ctx *sec_pipe, bool odm); -bool resource_subvp_in_use(struct dc *dc, - struct dc_state *context); - /* A test harness interface that modifies dp encoder resources in the given dc * state and bypasses the need to revalidate. The interface assumes that the * test harness interface is called with pre-validated link config stored in the diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c index ac04a9c9a3d86..c4d71e7f18af4 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c @@ -1899,7 +1899,7 @@ int dcn32_populate_dml_pipes_from_context( static struct dc_cap_funcs cap_funcs = { .get_dcc_compression_cap = dcn20_get_dcc_compression_cap, - .get_subvp_en = resource_subvp_in_use, + .get_subvp_en = dcn32_subvp_in_use, }; void dcn32_calculate_wm_and_dlg(struct dc *dc, struct dc_state *context, diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.h b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.h index 62611acd4bcb5..0c87b0fabba7d 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.h +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.h @@ -131,6 +131,9 @@ void dcn32_merge_pipes_for_subvp(struct dc *dc, bool dcn32_all_pipes_have_stream_and_plane(struct dc *dc, struct dc_state *context); +bool dcn32_subvp_in_use(struct dc *dc, + struct dc_state *context); + bool dcn32_mpo_in_use(struct dc_state *context); bool dcn32_any_surfaces_rotated(struct dc *dc, struct dc_state *context); diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c index e1ab207c46f15..74412e5f03fef 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c @@ -1574,7 +1574,7 @@ static void dcn321_destroy_resource_pool(struct resource_pool **pool) static struct dc_cap_funcs cap_funcs = { .get_dcc_compression_cap = dcn20_get_dcc_compression_cap, - .get_subvp_en = resource_subvp_in_use, + .get_subvp_en = dcn32_subvp_in_use, }; static void dcn321_update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_params) -- GitLab From a465536ebff88fcc42e131a1b09bbe3df829117b Mon Sep 17 00:00:00 2001 From: Martin Leung Date: Wed, 20 Dec 2023 16:56:11 -0500 Subject: [PATCH 0730/1290] drm/amd/display: revert "Optimize VRR updates to only necessary ones" This reverts commit 6e4337f695c25162f0296934152506ad596fcebf. The original commit causes regression in corner case with HDMI at specific timings. Reverting from staging to get the full suite to retest. Reviewed-by: Rodrigo Siqueira Signed-off-by: Martin Leung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 14 +++++++++----- drivers/gpu/drm/amd/display/dc/dc.h | 1 + drivers/gpu/drm/amd/display/dc/dc_stream.h | 2 ++ .../drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c | 2 +- .../drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c | 8 ++++---- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 2d7205058c64a..69e726630241d 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -411,12 +411,9 @@ bool dc_stream_adjust_vmin_vmax(struct dc *dc, * avoid conflicting with firmware updates. */ if (dc->ctx->dce_version > DCE_VERSION_MAX) - if (dc->optimized_required) + if (dc->optimized_required || dc->wm_optimized_required) return false; - if (!memcmp(&stream->adjust, adjust, sizeof(*adjust))) - return true; - stream->adjust.v_total_max = adjust->v_total_max; stream->adjust.v_total_mid = adjust->v_total_mid; stream->adjust.v_total_mid_frame_num = adjust->v_total_mid_frame_num; @@ -2230,6 +2227,7 @@ void dc_post_update_surfaces_to_stream(struct dc *dc) } dc->optimized_required = false; + dc->wm_optimized_required = false; } bool dc_set_generic_gpio_for_stereo(bool enable, @@ -2652,6 +2650,8 @@ enum surface_update_type dc_check_update_surfaces_for_stream( } else if (memcmp(&dc->current_state->bw_ctx.bw.dcn.clk, &dc->clk_mgr->clks, offsetof(struct dc_clocks, prev_p_state_change_support)) != 0) { dc->optimized_required = true; } + + dc->optimized_required |= dc->wm_optimized_required; } return type; @@ -2859,6 +2859,9 @@ static void copy_stream_update_to_stream(struct dc *dc, if (update->vrr_active_fixed) stream->vrr_active_fixed = *update->vrr_active_fixed; + if (update->crtc_timing_adjust) + stream->adjust = *update->crtc_timing_adjust; + if (update->dpms_off) stream->dpms_off = *update->dpms_off; @@ -4288,7 +4291,8 @@ static bool full_update_required(struct dc *dc, stream_update->mst_bw_update || stream_update->func_shaper || stream_update->lut3d_func || - stream_update->pending_test_pattern)) + stream_update->pending_test_pattern || + stream_update->crtc_timing_adjust)) return true; if (stream) { diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index f30a341bc0901..7222f63caf28a 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -1036,6 +1036,7 @@ struct dc { /* Require to optimize clocks and bandwidth for added/removed planes */ bool optimized_required; + bool wm_optimized_required; bool idle_optimizations_allowed; bool enable_c20_dtm_b0; diff --git a/drivers/gpu/drm/amd/display/dc/dc_stream.h b/drivers/gpu/drm/amd/display/dc/dc_stream.h index a23eebd9933b7..ee10941caa598 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_stream.h +++ b/drivers/gpu/drm/amd/display/dc/dc_stream.h @@ -139,6 +139,7 @@ union stream_update_flags { uint32_t wb_update:1; uint32_t dsc_changed : 1; uint32_t mst_bw : 1; + uint32_t crtc_timing_adjust : 1; uint32_t fams_changed : 1; } bits; @@ -325,6 +326,7 @@ struct dc_stream_update { struct dc_3dlut *lut3d_func; struct test_pattern *pending_test_pattern; + struct dc_crtc_timing_adjust *crtc_timing_adjust; }; bool dc_is_stream_unchanged( diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c index 51dd2ae09b2a6..6dd479e8a3485 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c @@ -3076,7 +3076,7 @@ void dcn10_prepare_bandwidth( context, false); - dc->optimized_required |= hubbub->funcs->program_watermarks(hubbub, + dc->wm_optimized_required = hubbub->funcs->program_watermarks(hubbub, &context->bw_ctx.bw.dcn.watermarks, dc->res_pool->ref_clocks.dchub_ref_clock_inKhz / 1000, true); diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c index 0dfcb3cdcd20c..e931342fcf4cf 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c @@ -2159,10 +2159,10 @@ void dcn20_prepare_bandwidth( } /* program dchubbub watermarks: - * For assigning optimized_required, use |= operator since we don't want + * For assigning wm_optimized_required, use |= operator since we don't want * to clear the value if the optimize has not happened yet */ - dc->optimized_required |= hubbub->funcs->program_watermarks(hubbub, + dc->wm_optimized_required |= hubbub->funcs->program_watermarks(hubbub, &context->bw_ctx.bw.dcn.watermarks, dc->res_pool->ref_clocks.dchub_ref_clock_inKhz / 1000, false); @@ -2175,10 +2175,10 @@ void dcn20_prepare_bandwidth( if (hubbub->funcs->program_compbuf_size) { if (context->bw_ctx.dml.ip.min_comp_buffer_size_kbytes) { compbuf_size_kb = context->bw_ctx.dml.ip.min_comp_buffer_size_kbytes; - dc->optimized_required |= (compbuf_size_kb != dc->current_state->bw_ctx.dml.ip.min_comp_buffer_size_kbytes); + dc->wm_optimized_required |= (compbuf_size_kb != dc->current_state->bw_ctx.dml.ip.min_comp_buffer_size_kbytes); } else { compbuf_size_kb = context->bw_ctx.bw.dcn.compbuf_size_kb; - dc->optimized_required |= (compbuf_size_kb != dc->current_state->bw_ctx.bw.dcn.compbuf_size_kb); + dc->wm_optimized_required |= (compbuf_size_kb != dc->current_state->bw_ctx.bw.dcn.compbuf_size_kb); } hubbub->funcs->program_compbuf_size(hubbub, compbuf_size_kb, false); -- GitLab From 5f3bce13266e6fe2f7a46f94d8bc94d5274e276b Mon Sep 17 00:00:00 2001 From: Peichen Huang Date: Thu, 14 Dec 2023 23:16:34 +0800 Subject: [PATCH 0731/1290] drm/amd/display: Request usb4 bw for mst streams [WHY] When usb4 bandwidth allocation mode is enabled, driver need to request bandwidth from connection manager. For mst link, the requested bandwidth should be big enough for all remote streams. [HOW] - If mst link, the requested bandwidth should be the sum of all mst streams bandwidth added with dp MTPH overhead. - Allocate/deallcate usb4 bandwidth when setting dpms on/off. - When doing display mode validation, driver also need to consider total bandwidth of all mst streams for mst link. Reviewed-by: Cruise Hung Acked-by: Rodrigo Siqueira Signed-off-by: Peichen Huang Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc_types.h | 12 ++-- .../gpu/drm/amd/display/dc/link/link_dpms.c | 42 ++++++++++--- .../drm/amd/display/dc/link/link_validation.c | 60 +++++++++++++++---- .../dc/link/protocols/link_dp_dpia_bw.c | 59 +++++++++++++----- .../dc/link/protocols/link_dp_dpia_bw.h | 9 +++ 5 files changed, 144 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc_types.h b/drivers/gpu/drm/amd/display/dc/dc_types.h index 4f276169e05a9..b08ccb8c68bc3 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_types.h +++ b/drivers/gpu/drm/amd/display/dc/dc_types.h @@ -1140,23 +1140,25 @@ struct dc_panel_config { } ilr; }; +#define MAX_SINKS_PER_LINK 4 + /* * USB4 DPIA BW ALLOCATION STRUCTS */ struct dc_dpia_bw_alloc { - int sink_verified_bw; // The Verified BW that sink can allocated and use that has been verified already - int sink_allocated_bw; // The Actual Allocated BW that sink currently allocated - int sink_max_bw; // The Max BW that sink can require/support + int remote_sink_req_bw[MAX_SINKS_PER_LINK]; // BW requested by remote sinks + int link_verified_bw; // The Verified BW that link can allocated and use that has been verified already + int link_max_bw; // The Max BW that link can require/support + int allocated_bw; // The Actual Allocated BW for this DPIA int estimated_bw; // The estimated available BW for this DPIA int bw_granularity; // BW Granularity + int dp_overhead; // DP overhead in dp tunneling bool bw_alloc_enabled; // The BW Alloc Mode Support is turned ON for all 3: DP-Tx & Dpia & CM bool response_ready; // Response ready from the CM side uint8_t nrd_max_lane_count; // Non-reduced max lane count uint8_t nrd_max_link_rate; // Non-reduced max link rate }; -#define MAX_SINKS_PER_LINK 4 - enum dc_hpd_enable_select { HPD_EN_FOR_ALL_EDP = 0, HPD_EN_FOR_PRIMARY_EDP_ONLY, diff --git a/drivers/gpu/drm/amd/display/dc/link/link_dpms.c b/drivers/gpu/drm/amd/display/dc/link/link_dpms.c index 5fe8b4871c776..3de148004c066 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_dpms.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_dpms.c @@ -2005,17 +2005,11 @@ static enum dc_status enable_link_dp(struct dc_state *state, } } - /* - * If the link is DP-over-USB4 do the following: - * - Train with fallback when enabling DPIA link. Conventional links are + /* Train with fallback when enabling DPIA link. Conventional links are * trained with fallback during sink detection. - * - Allocate only what the stream needs for bw in Gbps. Inform the CM - * in case stream needs more or less bw from what has been allocated - * earlier at plug time. */ - if (link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA) { + if (link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA) do_fallback = true; - } /* * Temporary w/a to get DP2.0 link rates to work with SST. @@ -2197,6 +2191,32 @@ static enum dc_status enable_link( return status; } +static bool allocate_usb4_bandwidth_for_stream(struct dc_stream_state *stream, int bw) +{ + return true; +} + +static bool allocate_usb4_bandwidth(struct dc_stream_state *stream) +{ + bool ret; + + int bw = dc_bandwidth_in_kbps_from_timing(&stream->timing, + dc_link_get_highest_encoding_format(stream->sink->link)); + + ret = allocate_usb4_bandwidth_for_stream(stream, bw); + + return ret; +} + +static bool deallocate_usb4_bandwidth(struct dc_stream_state *stream) +{ + bool ret; + + ret = allocate_usb4_bandwidth_for_stream(stream, 0); + + return ret; +} + void link_set_dpms_off(struct pipe_ctx *pipe_ctx) { struct dc *dc = pipe_ctx->stream->ctx->dc; @@ -2232,6 +2252,9 @@ void link_set_dpms_off(struct pipe_ctx *pipe_ctx) update_psp_stream_config(pipe_ctx, true); dc->hwss.blank_stream(pipe_ctx); + if (pipe_ctx->stream->link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA) + deallocate_usb4_bandwidth(pipe_ctx->stream); + if (pipe_ctx->stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST) deallocate_mst_payload(pipe_ctx); else if (pipe_ctx->stream->signal == SIGNAL_TYPE_DISPLAY_PORT && @@ -2474,6 +2497,9 @@ void link_set_dpms_on( } } + if (pipe_ctx->stream->link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA) + allocate_usb4_bandwidth(pipe_ctx->stream); + if (pipe_ctx->stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST) allocate_mst_payload(pipe_ctx); else if (pipe_ctx->stream->signal == SIGNAL_TYPE_DISPLAY_PORT && diff --git a/drivers/gpu/drm/amd/display/dc/link/link_validation.c b/drivers/gpu/drm/amd/display/dc/link/link_validation.c index b45fda96eaf64..8fe66c3678508 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_validation.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_validation.c @@ -346,23 +346,61 @@ enum dc_status link_validate_mode_timing( return DC_OK; } +/* + * This function calculates the bandwidth required for the stream timing + * and aggregates the stream bandwidth for the respective dpia link + * + * @stream: pointer to the dc_stream_state struct instance + * @num_streams: number of streams to be validated + * + * return: true if validation is succeeded + */ bool link_validate_dpia_bandwidth(const struct dc_stream_state *stream, const unsigned int num_streams) { - bool ret = true; - int bw_needed[MAX_DPIA_NUM]; - struct dc_link *link[MAX_DPIA_NUM]; - - if (!num_streams || num_streams > MAX_DPIA_NUM) - return ret; + int bw_needed[MAX_DPIA_NUM] = {0}; + struct dc_link *dpia_link[MAX_DPIA_NUM] = {0}; + int num_dpias = 0; for (uint8_t i = 0; i < num_streams; ++i) { + if (stream[i].signal == SIGNAL_TYPE_DISPLAY_PORT) { + /* new dpia sst stream, check whether it exceeds max dpia */ + if (num_dpias >= MAX_DPIA_NUM) + return false; - link[i] = stream[i].link; - bw_needed[i] = dc_bandwidth_in_kbps_from_timing(&stream[i].timing, - dc_link_get_highest_encoding_format(link[i])); + dpia_link[num_dpias] = stream[i].link; + bw_needed[num_dpias] = dc_bandwidth_in_kbps_from_timing(&stream[i].timing, + dc_link_get_highest_encoding_format(dpia_link[num_dpias])); + num_dpias++; + } else if (stream[i].signal == SIGNAL_TYPE_DISPLAY_PORT_MST) { + uint8_t j = 0; + /* check whether its a known dpia link */ + for (; j < num_dpias; ++j) { + if (dpia_link[j] == stream[i].link) + break; + } + + if (j == num_dpias) { + /* new dpia mst stream, check whether it exceeds max dpia */ + if (num_dpias >= MAX_DPIA_NUM) + return false; + else { + dpia_link[j] = stream[i].link; + num_dpias++; + } + } + + bw_needed[j] += dc_bandwidth_in_kbps_from_timing(&stream[i].timing, + dc_link_get_highest_encoding_format(dpia_link[j])); + } } - ret = dpia_validate_usb4_bw(link, bw_needed, num_streams); + /* Include dp overheads */ + for (uint8_t i = 0; i < num_dpias; ++i) { + int dp_overhead = 0; + + dp_overhead = link_dp_dpia_get_dp_overhead_in_dp_tunneling(dpia_link[i]); + bw_needed[i] += dp_overhead; + } - return ret; + return dpia_validate_usb4_bw(dpia_link, bw_needed, num_dpias); } diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.c index a7aa8c9da868f..4ef1a6a1d1295 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.c @@ -54,12 +54,18 @@ static bool get_bw_alloc_proceed_flag(struct dc_link *tmp) static void reset_bw_alloc_struct(struct dc_link *link) { link->dpia_bw_alloc_config.bw_alloc_enabled = false; - link->dpia_bw_alloc_config.sink_verified_bw = 0; - link->dpia_bw_alloc_config.sink_max_bw = 0; + link->dpia_bw_alloc_config.link_verified_bw = 0; + link->dpia_bw_alloc_config.link_max_bw = 0; + link->dpia_bw_alloc_config.allocated_bw = 0; link->dpia_bw_alloc_config.estimated_bw = 0; link->dpia_bw_alloc_config.bw_granularity = 0; + link->dpia_bw_alloc_config.dp_overhead = 0; link->dpia_bw_alloc_config.response_ready = false; - link->dpia_bw_alloc_config.sink_allocated_bw = 0; + link->dpia_bw_alloc_config.nrd_max_lane_count = 0; + link->dpia_bw_alloc_config.nrd_max_link_rate = 0; + for (int i = 0; i < MAX_SINKS_PER_LINK; i++) + link->dpia_bw_alloc_config.remote_sink_req_bw[i] = 0; + DC_LOG_DEBUG("reset usb4 bw alloc of link(%d)\n", link->link_index); } #define BW_GRANULARITY_0 4 // 0.25 Gbps @@ -210,8 +216,8 @@ static int get_host_router_total_dp_tunnel_bw(const struct dc *dc, uint8_t hr_in link_dpia_primary->dpia_bw_alloc_config.bw_alloc_enabled) && (link_dpia_secondary->hpd_status && link_dpia_secondary->dpia_bw_alloc_config.bw_alloc_enabled)) { - total_bw += link_dpia_primary->dpia_bw_alloc_config.estimated_bw + - link_dpia_secondary->dpia_bw_alloc_config.sink_allocated_bw; + total_bw += link_dpia_primary->dpia_bw_alloc_config.estimated_bw + + link_dpia_secondary->dpia_bw_alloc_config.allocated_bw; } else if (link_dpia_primary->hpd_status && link_dpia_primary->dpia_bw_alloc_config.bw_alloc_enabled) { total_bw = link_dpia_primary->dpia_bw_alloc_config.estimated_bw; @@ -264,7 +270,7 @@ static void set_usb4_req_bw_req(struct dc_link *link, int req_bw) /* Error check whether requested and allocated are equal */ req_bw = requested_bw * (Kbps_TO_Gbps / link->dpia_bw_alloc_config.bw_granularity); - if (req_bw == link->dpia_bw_alloc_config.sink_allocated_bw) { + if (req_bw == link->dpia_bw_alloc_config.allocated_bw) { DC_LOG_ERROR("%s: Request bw equals to allocated bw for link(%d)\n", __func__, link->link_index); } @@ -387,9 +393,9 @@ void dpia_handle_bw_alloc_response(struct dc_link *link, uint8_t bw, uint8_t res DC_LOG_DEBUG("%s: BW REQ SUCCESS for DP-TX Request for link(%d)\n", __func__, link->link_index); DC_LOG_DEBUG("%s: current allocated_bw(%d), new allocated_bw(%d)\n", - __func__, link->dpia_bw_alloc_config.sink_allocated_bw, bw_needed); + __func__, link->dpia_bw_alloc_config.allocated_bw, bw_needed); - link->dpia_bw_alloc_config.sink_allocated_bw = bw_needed; + link->dpia_bw_alloc_config.allocated_bw = bw_needed; link->dpia_bw_alloc_config.response_ready = true; break; @@ -427,8 +433,8 @@ int dpia_handle_usb4_bandwidth_allocation_for_link(struct dc_link *link, int pea if (link->hpd_status && peak_bw > 0) { // If DP over USB4 then we need to check BW allocation - link->dpia_bw_alloc_config.sink_max_bw = peak_bw; - set_usb4_req_bw_req(link, link->dpia_bw_alloc_config.sink_max_bw); + link->dpia_bw_alloc_config.link_max_bw = peak_bw; + set_usb4_req_bw_req(link, link->dpia_bw_alloc_config.link_max_bw); do { if (timeout > 0) @@ -440,8 +446,8 @@ int dpia_handle_usb4_bandwidth_allocation_for_link(struct dc_link *link, int pea if (!timeout) ret = 0;// ERROR TIMEOUT waiting for response for allocating bw - else if (link->dpia_bw_alloc_config.sink_allocated_bw > 0) - ret = link->dpia_bw_alloc_config.sink_allocated_bw; + else if (link->dpia_bw_alloc_config.allocated_bw > 0) + ret = link->dpia_bw_alloc_config.allocated_bw; } //2. Cold Unplug else if (!link->hpd_status) @@ -450,7 +456,6 @@ int dpia_handle_usb4_bandwidth_allocation_for_link(struct dc_link *link, int pea out: return ret; } - bool link_dp_dpia_allocate_usb4_bandwidth_for_stream(struct dc_link *link, int req_bw) { bool ret = false; @@ -458,7 +463,7 @@ bool link_dp_dpia_allocate_usb4_bandwidth_for_stream(struct dc_link *link, int r DC_LOG_DEBUG("%s: ENTER: link(%d), hpd_status(%d), current allocated_bw(%d), req_bw(%d)\n", __func__, link->link_index, link->hpd_status, - link->dpia_bw_alloc_config.sink_allocated_bw, req_bw); + link->dpia_bw_alloc_config.allocated_bw, req_bw); if (!get_bw_alloc_proceed_flag(link)) goto out; @@ -523,3 +528,29 @@ bool dpia_validate_usb4_bw(struct dc_link **link, int *bw_needed_per_dpia, const return ret; } + +int link_dp_dpia_get_dp_overhead_in_dp_tunneling(struct dc_link *link) +{ + int dp_overhead = 0, link_mst_overhead = 0; + + if (!get_bw_alloc_proceed_flag((link))) + return dp_overhead; + + /* if its mst link, add MTPH overhead */ + if ((link->type == dc_connection_mst_branch) && + !link->dpcd_caps.channel_coding_cap.bits.DP_128b_132b_SUPPORTED) { + /* For 8b/10b encoding: MTP is 64 time slots long, slot 0 is used for MTPH + * MST overhead is 1/64 of link bandwidth (excluding any overhead) + */ + const struct dc_link_settings *link_cap = + dc_link_get_link_cap(link); + uint32_t link_bw_in_kbps = + link_cap->link_rate * link_cap->lane_count * LINK_RATE_REF_FREQ_IN_KHZ * 8; + link_mst_overhead = (link_bw_in_kbps / 64) + ((link_bw_in_kbps % 64) ? 1 : 0); + } + + /* add all the overheads */ + dp_overhead = link_mst_overhead; + + return dp_overhead; +} diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.h b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.h index 981bc4eb6120e..3b6d8494f9d5d 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.h +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.h @@ -99,4 +99,13 @@ void dpia_handle_bw_alloc_response(struct dc_link *link, uint8_t bw, uint8_t res */ bool dpia_validate_usb4_bw(struct dc_link **link, int *bw_needed, const unsigned int num_dpias); +/* + * Obtain all the DP overheads in dp tunneling for the dpia link + * + * @link: pointer to the dc_link struct instance + * + * return: DP overheads in DP tunneling + */ +int link_dp_dpia_get_dp_overhead_in_dp_tunneling(struct dc_link *link); + #endif /* DC_INC_LINK_DP_DPIA_BW_H_ */ -- GitLab From bf282eb92b84709d99186ad5940b9997eb3c1ff2 Mon Sep 17 00:00:00 2001 From: Daniel Miess Date: Wed, 20 Dec 2023 10:34:32 -0500 Subject: [PATCH 0732/1290] Revert "drm/amd/display: Fix conversions between bytes and KB" This reverts commit d0f639c5869399bf6dde4d694d5f8c0ab8c0ec46. The previous commit causes failure to light up for 1080p eDP + 8k HDMI panel combo. Reviewed-by: Charlene Liu Acked-by: Rodrigo Siqueira Signed-off-by: Daniel Miess Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/dc/dml2/display_mode_core.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c b/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c index b95bf27f2fe2f..a6b938a12de13 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c @@ -6229,7 +6229,7 @@ static void set_calculate_prefetch_schedule_params(struct display_mode_lib_st *m CalculatePrefetchSchedule_params->GPUVMEnable = mode_lib->ms.cache_display_cfg.plane.GPUVMEnable; CalculatePrefetchSchedule_params->HostVMEnable = mode_lib->ms.cache_display_cfg.plane.HostVMEnable; CalculatePrefetchSchedule_params->HostVMMaxNonCachedPageTableLevels = mode_lib->ms.cache_display_cfg.plane.HostVMMaxPageTableLevels; - CalculatePrefetchSchedule_params->HostVMMinPageSize = mode_lib->ms.soc.hostvm_min_page_size_kbytes; + CalculatePrefetchSchedule_params->HostVMMinPageSize = mode_lib->ms.soc.hostvm_min_page_size_kbytes * 1024; CalculatePrefetchSchedule_params->DynamicMetadataEnable = mode_lib->ms.cache_display_cfg.plane.DynamicMetadataEnable[k]; CalculatePrefetchSchedule_params->DynamicMetadataVMEnabled = mode_lib->ms.ip.dynamic_metadata_vm_enabled; CalculatePrefetchSchedule_params->DynamicMetadataLinesBeforeActiveRequired = mode_lib->ms.cache_display_cfg.plane.DynamicMetadataLinesBeforeActiveRequired[k]; @@ -6329,7 +6329,7 @@ static void dml_prefetch_check(struct display_mode_lib_st *mode_lib) mode_lib->ms.NoOfDPPThisState, mode_lib->ms.dpte_group_bytes, s->HostVMInefficiencyFactor, - mode_lib->ms.soc.hostvm_min_page_size_kbytes, + mode_lib->ms.soc.hostvm_min_page_size_kbytes * 1024, mode_lib->ms.cache_display_cfg.plane.HostVMMaxPageTableLevels); s->NextMaxVStartup = s->MaxVStartupAllPlanes[j]; @@ -6542,7 +6542,7 @@ static void dml_prefetch_check(struct display_mode_lib_st *mode_lib) mode_lib->ms.cache_display_cfg.plane.HostVMEnable, mode_lib->ms.cache_display_cfg.plane.HostVMMaxPageTableLevels, mode_lib->ms.cache_display_cfg.plane.GPUVMEnable, - mode_lib->ms.soc.hostvm_min_page_size_kbytes, + mode_lib->ms.soc.hostvm_min_page_size_kbytes * 1024, mode_lib->ms.PDEAndMetaPTEBytesPerFrame[j][k], mode_lib->ms.MetaRowBytes[j][k], mode_lib->ms.DPTEBytesPerRow[j][k], @@ -7687,7 +7687,7 @@ dml_bool_t dml_core_mode_support(struct display_mode_lib_st *mode_lib) CalculateVMRowAndSwath_params->HostVMMaxNonCachedPageTableLevels = mode_lib->ms.cache_display_cfg.plane.HostVMMaxPageTableLevels; CalculateVMRowAndSwath_params->GPUVMMaxPageTableLevels = mode_lib->ms.cache_display_cfg.plane.GPUVMMaxPageTableLevels; CalculateVMRowAndSwath_params->GPUVMMinPageSizeKBytes = mode_lib->ms.cache_display_cfg.plane.GPUVMMinPageSizeKBytes; - CalculateVMRowAndSwath_params->HostVMMinPageSize = mode_lib->ms.soc.hostvm_min_page_size_kbytes; + CalculateVMRowAndSwath_params->HostVMMinPageSize = mode_lib->ms.soc.hostvm_min_page_size_kbytes * 1024; CalculateVMRowAndSwath_params->PTEBufferModeOverrideEn = mode_lib->ms.cache_display_cfg.plane.PTEBufferModeOverrideEn; CalculateVMRowAndSwath_params->PTEBufferModeOverrideVal = mode_lib->ms.cache_display_cfg.plane.PTEBufferMode; CalculateVMRowAndSwath_params->PTEBufferSizeNotExceeded = mode_lib->ms.PTEBufferSizeNotExceededPerState; @@ -7957,7 +7957,7 @@ dml_bool_t dml_core_mode_support(struct display_mode_lib_st *mode_lib) UseMinimumDCFCLK_params->GPUVMMaxPageTableLevels = mode_lib->ms.cache_display_cfg.plane.GPUVMMaxPageTableLevels; UseMinimumDCFCLK_params->HostVMEnable = mode_lib->ms.cache_display_cfg.plane.HostVMEnable; UseMinimumDCFCLK_params->NumberOfActiveSurfaces = mode_lib->ms.num_active_planes; - UseMinimumDCFCLK_params->HostVMMinPageSize = mode_lib->ms.soc.hostvm_min_page_size_kbytes; + UseMinimumDCFCLK_params->HostVMMinPageSize = mode_lib->ms.soc.hostvm_min_page_size_kbytes * 1024; UseMinimumDCFCLK_params->HostVMMaxNonCachedPageTableLevels = mode_lib->ms.cache_display_cfg.plane.HostVMMaxPageTableLevels; UseMinimumDCFCLK_params->DynamicMetadataVMEnabled = mode_lib->ms.ip.dynamic_metadata_vm_enabled; UseMinimumDCFCLK_params->ImmediateFlipRequirement = s->ImmediateFlipRequiredFinal; @@ -8699,7 +8699,7 @@ void dml_core_mode_programming(struct display_mode_lib_st *mode_lib, const struc CalculateVMRowAndSwath_params->HostVMMaxNonCachedPageTableLevels = mode_lib->ms.cache_display_cfg.plane.HostVMMaxPageTableLevels; CalculateVMRowAndSwath_params->GPUVMMaxPageTableLevels = mode_lib->ms.cache_display_cfg.plane.GPUVMMaxPageTableLevels; CalculateVMRowAndSwath_params->GPUVMMinPageSizeKBytes = mode_lib->ms.cache_display_cfg.plane.GPUVMMinPageSizeKBytes; - CalculateVMRowAndSwath_params->HostVMMinPageSize = mode_lib->ms.soc.hostvm_min_page_size_kbytes; + CalculateVMRowAndSwath_params->HostVMMinPageSize = mode_lib->ms.soc.hostvm_min_page_size_kbytes * 1024; CalculateVMRowAndSwath_params->PTEBufferModeOverrideEn = mode_lib->ms.cache_display_cfg.plane.PTEBufferModeOverrideEn; CalculateVMRowAndSwath_params->PTEBufferModeOverrideVal = mode_lib->ms.cache_display_cfg.plane.PTEBufferMode; CalculateVMRowAndSwath_params->PTEBufferSizeNotExceeded = s->dummy_boolean_array[0]; @@ -8805,7 +8805,7 @@ void dml_core_mode_programming(struct display_mode_lib_st *mode_lib, const struc mode_lib->ms.cache_display_cfg.hw.DPPPerSurface, locals->dpte_group_bytes, s->HostVMInefficiencyFactor, - mode_lib->ms.soc.hostvm_min_page_size_kbytes, + mode_lib->ms.soc.hostvm_min_page_size_kbytes * 1024, mode_lib->ms.cache_display_cfg.plane.HostVMMaxPageTableLevels); locals->TCalc = 24.0 / locals->DCFCLKDeepSleep; @@ -8995,7 +8995,7 @@ void dml_core_mode_programming(struct display_mode_lib_st *mode_lib, const struc CalculatePrefetchSchedule_params->GPUVMEnable = mode_lib->ms.cache_display_cfg.plane.GPUVMEnable; CalculatePrefetchSchedule_params->HostVMEnable = mode_lib->ms.cache_display_cfg.plane.HostVMEnable; CalculatePrefetchSchedule_params->HostVMMaxNonCachedPageTableLevels = mode_lib->ms.cache_display_cfg.plane.HostVMMaxPageTableLevels; - CalculatePrefetchSchedule_params->HostVMMinPageSize = mode_lib->ms.soc.hostvm_min_page_size_kbytes; + CalculatePrefetchSchedule_params->HostVMMinPageSize = mode_lib->ms.soc.hostvm_min_page_size_kbytes * 1024; CalculatePrefetchSchedule_params->DynamicMetadataEnable = mode_lib->ms.cache_display_cfg.plane.DynamicMetadataEnable[k]; CalculatePrefetchSchedule_params->DynamicMetadataVMEnabled = mode_lib->ms.ip.dynamic_metadata_vm_enabled; CalculatePrefetchSchedule_params->DynamicMetadataLinesBeforeActiveRequired = mode_lib->ms.cache_display_cfg.plane.DynamicMetadataLinesBeforeActiveRequired[k]; @@ -9240,7 +9240,7 @@ void dml_core_mode_programming(struct display_mode_lib_st *mode_lib, const struc mode_lib->ms.cache_display_cfg.plane.HostVMEnable, mode_lib->ms.cache_display_cfg.plane.HostVMMaxPageTableLevels, mode_lib->ms.cache_display_cfg.plane.GPUVMEnable, - mode_lib->ms.soc.hostvm_min_page_size_kbytes, + mode_lib->ms.soc.hostvm_min_page_size_kbytes * 1024, locals->PDEAndMetaPTEBytesFrame[k], locals->MetaRowByte[k], locals->PixelPTEBytesPerRow[k], -- GitLab From 2476bf4328d1a55db709ce9ad2c274d26040311b Mon Sep 17 00:00:00 2001 From: Charlene Liu Date: Thu, 21 Dec 2023 09:42:28 -0500 Subject: [PATCH 0733/1290] drm/amd/display: Update z8 latency Adjust z8 latency for performance. Reviewed-by: Muhammad Ahmed Acked-by: Rodrigo Siqueira Signed-off-by: Charlene Liu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c index 3d12dabd39e47..475c4ec43c013 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c @@ -166,9 +166,9 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_5_soc = { .num_states = 5, .sr_exit_time_us = 14.0, .sr_enter_plus_exit_time_us = 16.0, - .sr_exit_z8_time_us = 525.0, - .sr_enter_plus_exit_z8_time_us = 715.0, - .fclk_change_latency_us = 20.0, + .sr_exit_z8_time_us = 210.0, + .sr_enter_plus_exit_z8_time_us = 320.0, + .fclk_change_latency_us = 24.0, .usr_retraining_latency_us = 2, .writeback_latency_us = 12.0, -- GitLab From ab76bd72ee12d9117c3a16d749ffce84f5b235bf Mon Sep 17 00:00:00 2001 From: Meenakshikumar Somasundaram Date: Tue, 19 Dec 2023 15:51:24 -0500 Subject: [PATCH 0734/1290] drm/amd/display: Dpia hpd status not in sync after S4 [Why] Dpia hpd status not in sync causing driver not enabling BW Alloc after S4. [How] Update hpd_status of the link when querying hpd state from dmub in dpia_query_hpd_status(). Reviewed-by: Nicholas Kazlauskas Acked-by: Rodrigo Siqueira Signed-off-by: Meenakshikumar Somasundaram Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../display/dc/link/protocols/link_dp_dpia.c | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia.c index 982eda3c46f56..6af42ba9885c0 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia.c @@ -82,25 +82,33 @@ bool dpia_query_hpd_status(struct dc_link *link) { union dmub_rb_cmd cmd = {0}; struct dc_dmub_srv *dmub_srv = link->ctx->dmub_srv; - bool is_hpd_high = false; /* prepare QUERY_HPD command */ cmd.query_hpd.header.type = DMUB_CMD__QUERY_HPD_STATE; cmd.query_hpd.data.instance = link->link_id.enum_id - ENUM_ID_1; cmd.query_hpd.data.ch_type = AUX_CHANNEL_DPIA; - /* Return HPD status reported by DMUB if query successfully executed. */ - if (dc_wake_and_execute_dmub_cmd(dmub_srv->ctx, &cmd, DM_DMUB_WAIT_TYPE_WAIT_WITH_REPLY) && - cmd.query_hpd.data.status == AUX_RET_SUCCESS) - is_hpd_high = cmd.query_hpd.data.result; - - DC_LOG_DEBUG("%s: link(%d) dpia(%d) cmd_status(%d) result(%d)\n", - __func__, - link->link_index, - link->link_id.enum_id - ENUM_ID_1, - cmd.query_hpd.data.status, - cmd.query_hpd.data.result); - - return is_hpd_high; + /* Query dpia hpd status from dmub */ + if (dc_wake_and_execute_dmub_cmd(dmub_srv->ctx, &cmd, + DM_DMUB_WAIT_TYPE_WAIT_WITH_REPLY) && + cmd.query_hpd.data.status == AUX_RET_SUCCESS) { + DC_LOG_DEBUG("%s: for link(%d) dpia(%d) success, current_hpd_status(%d) new_hpd_status(%d)\n", + __func__, + link->link_index, + link->link_id.enum_id - ENUM_ID_1, + link->hpd_status, + cmd.query_hpd.data.result); + link->hpd_status = cmd.query_hpd.data.result; + } else { + DC_LOG_ERROR("%s: for link(%d) dpia(%d) failed with status(%d), current_hpd_status(%d) new_hpd_status(0)\n", + __func__, + link->link_index, + link->link_id.enum_id - ENUM_ID_1, + cmd.query_hpd.data.status, + link->hpd_status); + link->hpd_status = false; + } + + return link->hpd_status; } -- GitLab From d32156a07575d69916944ce0e2d4a71a4c95979d Mon Sep 17 00:00:00 2001 From: Aric Cyr Date: Sun, 17 Dec 2023 19:36:16 -0500 Subject: [PATCH 0735/1290] drm/amd/display: 3.2.266 This version brings along following fixes: - Improve z8/z10 support. - Revert some of the VRR optimization. - Improve usb4 when using MST. Acked-by: Rodrigo Siqueira Signed-off-by: Aric Cyr Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 7222f63caf28a..5d7aa882416b3 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -51,7 +51,7 @@ struct aux_payload; struct set_config_cmd_payload; struct dmub_notification; -#define DC_VER "3.2.265" +#define DC_VER "3.2.266" #define MAX_SURFACES 3 #define MAX_PLANES 6 -- GitLab From 90bd01471d1c7f2d2db3c69259e247357991fe50 Mon Sep 17 00:00:00 2001 From: Candice Li Date: Thu, 4 Jan 2024 09:34:25 +0800 Subject: [PATCH 0736/1290] drm/amdgpu: Drop unnecessary sentences about CE and deferred error. Remove "no user action is needed" for correctable and deferred error to avoid confusion. Signed-off-by: Candice Li Reviewed-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +++++--------- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 3 +-- drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 3 +-- drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 2 +- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index fc42fb6ee1914..9a64584318e02 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1067,8 +1067,7 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, mcm_info = &err_info->mcm_info; if (err_info->ce_count) { dev_info(adev->dev, "socket: %d, die: %d, " - "%lld new correctable hardware errors detected in %s block, " - "no user action is needed\n", + "%lld new correctable hardware errors detected in %s block\n", mcm_info->socket_id, mcm_info->die_id, err_info->ce_count, @@ -1080,8 +1079,7 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, err_info = &err_node->err_info; mcm_info = &err_info->mcm_info; dev_info(adev->dev, "socket: %d, die: %d, " - "%lld correctable hardware errors detected in total in %s block, " - "no user action is needed\n", + "%lld correctable hardware errors detected in total in %s block\n", mcm_info->socket_id, mcm_info->die_id, err_info->ce_count, blk_name); } } @@ -1108,16 +1106,14 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, adev->smuio.funcs->get_die_id) { dev_info(adev->dev, "socket: %d, die: %d " "%ld correctable hardware errors " - "detected in %s block, no user " - "action is needed.\n", + "detected in %s block\n", adev->smuio.funcs->get_socket_id(adev), adev->smuio.funcs->get_die_id(adev), ras_mgr->err_data.ce_count, blk_name); } else { dev_info(adev->dev, "%ld correctable hardware errors " - "detected in %s block, no user " - "action is needed.\n", + "detected in %s block\n", ras_mgr->err_data.ce_count, blk_name); } @@ -1920,7 +1916,7 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj struct amdgpu_iv_entry *entry) { dev_info(obj->adev->dev, - "Poison is created, no user action is needed.\n"); + "Poison is created\n"); } static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 6d24c84924cb5..19986ff6a48d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -401,8 +401,7 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device if (err_data.ce_count) dev_info(adev->dev, "%ld correctable hardware " - "errors detected in %s block, " - "no user action is needed.\n", + "errors detected in %s block\n", obj->err_data.ce_count, get_ras_block_str(adev->nbio.ras_if)); diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c index 25a3da83e0fb9..e90f337808034 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c @@ -597,8 +597,7 @@ static void nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device if (err_data.ce_count) dev_info(adev->dev, "%ld correctable hardware " - "errors detected in %s block, " - "no user action is needed.\n", + "errors detected in %s block\n", obj->err_data.ce_count, get_ras_block_str(adev->nbio.ras_if)); diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c index 530549314ce46..a3ee3c4c650fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c @@ -64,7 +64,7 @@ static void umc_v6_7_query_error_status_helper(struct amdgpu_device *adev, uint64_t reg_value; if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1) - dev_info(adev->dev, "Deferred error, no user action is needed.\n"); + dev_info(adev->dev, "Deferred error\n"); if (mc_umc_status) dev_info(adev->dev, "MCA STATUS 0x%llx, umc_reg_offset 0x%x\n", mc_umc_status, umc_reg_offset); -- GitLab From f4a94dbb6dc0bed10a5fc63718d00f1de45b12c0 Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Fri, 5 Jan 2024 17:33:34 +0800 Subject: [PATCH 0737/1290] drm/amdgpu: correct the cu count for gfx v11 Correct the algorithm of active CU to skip disabled sa for gfx v11. Signed-off-by: Likun Gao Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 2fbcd9765980d..c7242877d5d31 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -6383,6 +6383,9 @@ static int gfx_v11_0_get_cu_info(struct amdgpu_device *adev, mutex_lock(&adev->grbm_idx_mutex); for (i = 0; i < adev->gfx.config.max_shader_engines; i++) { for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) { + bitmap = i * adev->gfx.config.max_sh_per_se + j; + if (!((gfx_v11_0_get_sa_active_bitmap(adev) >> bitmap) & 1)) + continue; mask = 1; counter = 0; gfx_v11_0_select_se_sh(adev, i, j, 0xffffffff, 0); -- GitLab From fb1e91719983c529f85602fdd08c0b7dbf384b1c Mon Sep 17 00:00:00 2001 From: Candice Li Date: Thu, 4 Jan 2024 10:27:10 +0800 Subject: [PATCH 0738/1290] drm/amdgpu: Support poison error injection via ras_ctrl debugfs Support poison error injection. Signed-off-by: Candice Li Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 9a64584318e02..3f7f5c12961d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -305,11 +305,13 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, return -EINVAL; data->head.block = block_id; - /* only ue and ce errors are supported */ + /* only ue, ce and poison errors are supported */ if (!memcmp("ue", err, 2)) data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; else if (!memcmp("ce", err, 2)) data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; + else if (!memcmp("poison", err, 6)) + data->head.type = AMDGPU_RAS_ERROR__POISON; else return -EINVAL; @@ -431,9 +433,10 @@ static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev, * The block is one of: umc, sdma, gfx, etc. * see ras_block_string[] for details * - * The error type is one of: ue, ce, where, + * The error type is one of: ue, ce and poison where, * ue is multi-uncorrectable * ce is single-correctable + * poison is poison * * The sub-block is a the sub-block index, pass 0 if there is no sub-block. * The address and value are hexadecimal numbers, leading 0x is optional. -- GitLab From 73cb81dc548f154547d9205d5b9603ba10e2a402 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 29 Dec 2023 14:54:35 +0800 Subject: [PATCH 0739/1290] drm/amdgpu: Packed socket_id to ras feature mask Initialize RAS feature mask bit[31:29] with socket_id. Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3f7f5c12961d3..31823a30dea21 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2919,6 +2919,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev) amdgpu_ras_query_poison_mode(adev); + /* Packed socket_id to ras feature mask bits[31:29] */ + if (adev->smuio.funcs && + adev->smuio.funcs->get_socket_id) + con->features |= ((adev->smuio.funcs->get_socket_id(adev)) << 29); + /* Get RAS schema for particular SOC */ con->schema = amdgpu_get_ras_schema(adev); -- GitLab From 2a9de42e8d3c82c6990d226198602be44f43f340 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 29 Dec 2023 15:19:25 -0500 Subject: [PATCH 0740/1290] drm/amdkfd: Fix lock dependency warning with srcu ====================================================== WARNING: possible circular locking dependency detected 6.5.0-kfd-yangp #2289 Not tainted ------------------------------------------------------ kworker/0:2/996 is trying to acquire lock: (srcu){.+.+}-{0:0}, at: __synchronize_srcu+0x5/0x1a0 but task is already holding lock: ((work_completion)(&svms->deferred_list_work)){+.+.}-{0:0}, at: process_one_work+0x211/0x560 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 ((work_completion)(&svms->deferred_list_work)){+.+.}-{0:0}: __flush_work+0x88/0x4f0 svm_range_list_lock_and_flush_work+0x3d/0x110 [amdgpu] svm_range_set_attr+0xd6/0x14c0 [amdgpu] kfd_ioctl+0x1d1/0x630 [amdgpu] __x64_sys_ioctl+0x88/0xc0 -> #2 (&info->lock#2){+.+.}-{3:3}: __mutex_lock+0x99/0xc70 amdgpu_amdkfd_gpuvm_restore_process_bos+0x54/0x740 [amdgpu] restore_process_helper+0x22/0x80 [amdgpu] restore_process_worker+0x2d/0xa0 [amdgpu] process_one_work+0x29b/0x560 worker_thread+0x3d/0x3d0 -> #1 ((work_completion)(&(&process->restore_work)->work)){+.+.}-{0:0}: __flush_work+0x88/0x4f0 __cancel_work_timer+0x12c/0x1c0 kfd_process_notifier_release_internal+0x37/0x1f0 [amdgpu] __mmu_notifier_release+0xad/0x240 exit_mmap+0x6a/0x3a0 mmput+0x6a/0x120 do_exit+0x322/0xb90 do_group_exit+0x37/0xa0 __x64_sys_exit_group+0x18/0x20 do_syscall_64+0x38/0x80 -> #0 (srcu){.+.+}-{0:0}: __lock_acquire+0x1521/0x2510 lock_sync+0x5f/0x90 __synchronize_srcu+0x4f/0x1a0 __mmu_notifier_release+0x128/0x240 exit_mmap+0x6a/0x3a0 mmput+0x6a/0x120 svm_range_deferred_list_work+0x19f/0x350 [amdgpu] process_one_work+0x29b/0x560 worker_thread+0x3d/0x3d0 other info that might help us debug this: Chain exists of: srcu --> &info->lock#2 --> (work_completion)(&svms->deferred_list_work) Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock((work_completion)(&svms->deferred_list_work)); lock(&info->lock#2); lock((work_completion)(&svms->deferred_list_work)); sync(srcu); Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index d46c145835e09..4e9f07c7a9376 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2340,8 +2340,10 @@ retry: mutex_unlock(&svms->lock); mmap_write_unlock(mm); - /* Pairs with mmget in svm_range_add_list_work */ - mmput(mm); + /* Pairs with mmget in svm_range_add_list_work. If dropping the + * last mm refcount, schedule release work to avoid circular locking + */ + mmput_async(mm); spin_lock(&svms->deferred_list_lock); } -- GitLab From c147ddc68e741aed78bba796effe049344d87ab8 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 3 Jan 2024 18:13:04 -0500 Subject: [PATCH 0741/1290] drm/amdkfd: Fix sparse __rcu annotation warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Properly mark kfd_process->ef as __rcu and consistently use the right accessor functions. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312052245.yFpBSgNH-lkp@intel.com/ Signed-off-by: Felix Kuehling Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 7 +++++-- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index cf6ed5fce291f..f262b9d89541a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -311,7 +311,7 @@ void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem); int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo); int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, - struct dma_fence **ef); + struct dma_fence __rcu **ef); int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev, struct kfd_vm_fault_info *info); int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index d17b2452cb1f6..f183d7faeeece 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2802,7 +2802,7 @@ unlock_out: put_task_struct(usertask); } -static void replace_eviction_fence(struct dma_fence **ef, +static void replace_eviction_fence(struct dma_fence __rcu **ef, struct dma_fence *new_ef) { struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true @@ -2837,7 +2837,7 @@ static void replace_eviction_fence(struct dma_fence **ef, * 7. Add fence to all PD and PT BOs. * 8. Unreserve all BOs */ -int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef) +int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu **ef) { struct amdkfd_process_info *process_info = info; struct amdgpu_vm *peer_vm; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 745024b313401..17fbedbf36513 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -917,7 +917,7 @@ struct kfd_process { * fence will be triggered during eviction and new one will be created * during restore */ - struct dma_fence *ef; + struct dma_fence __rcu *ef; /* Work items for evicting and restoring BOs */ struct delayed_work eviction_work; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 71df51fcc1b0d..717a60d7a4ea9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1110,6 +1110,7 @@ static void kfd_process_wq_release(struct work_struct *work) { struct kfd_process *p = container_of(work, struct kfd_process, release_work); + struct dma_fence *ef; kfd_process_dequeue_from_all_devices(p); pqm_uninit(&p->pqm); @@ -1118,7 +1119,9 @@ static void kfd_process_wq_release(struct work_struct *work) * destroyed. This allows any BOs to be freed without * triggering pointless evictions or waiting for fences. */ - dma_fence_signal(p->ef); + synchronize_rcu(); + ef = rcu_access_pointer(p->ef); + dma_fence_signal(ef); kfd_process_remove_sysfs(p); @@ -1127,7 +1130,7 @@ static void kfd_process_wq_release(struct work_struct *work) svm_range_list_fini(p); kfd_process_destroy_pdds(p); - dma_fence_put(p->ef); + dma_fence_put(ef); kfd_event_free_process(p); -- GitLab From c2ab9ce0ee7225fc05f58a6671c43b8a3684f530 Mon Sep 17 00:00:00 2001 From: Ivan Lipski Date: Fri, 5 Jan 2024 19:40:50 -0500 Subject: [PATCH 0742/1290] Revert "drm/amd/display: fix bandwidth validation failure on DCN 2.1" This commit causes dmesg-warn on several IGT tests on DCN 3.1.6: *ERROR* link_enc_cfg_validate: Invalid link encoder assignments - 0x1c Affected IGT tests include: - amdgpu/[amd_assr|amd_plane|amd_hotplug] - kms_atomic - kms_color - kms_flip - kms_properties - kms_universal_plane and some other tests This reverts commit 3a0fa3bc245ef92838a8296e0055569b8dff94c4. Cc: Melissa Wen Cc: Hamza Mahfooz Reviewed-by: Rodrigo Siqueira Signed-off-by: Ivan Lipski Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index f6575d7dee971..10b2a896c4982 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -10753,7 +10753,7 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, DRM_DEBUG_DRIVER("drm_dp_mst_atomic_check() failed\n"); goto fail; } - status = dc_validate_global_state(dc, dm_state->context, false); + status = dc_validate_global_state(dc, dm_state->context, true); if (status != DC_OK) { DRM_DEBUG_DRIVER("DC global validation failure: %s (%d)", dc_status_to_str(status), status); -- GitLab From 50e60184bfe72400c49f7806af97edaf693ecd45 Mon Sep 17 00:00:00 2001 From: James Zhu Date: Tue, 2 Jan 2024 15:53:01 -0500 Subject: [PATCH 0743/1290] drm/amdgpu: make a correction on comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use a generic comment for AMDGPU_VM_RESERVED_VRAM size. Signed-off-by: James Zhu Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index b6cd565562ad8..4740dd65b99d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -116,7 +116,7 @@ struct amdgpu_mem_stats; #define AMDGPU_VM_FAULT_STOP_FIRST 1 #define AMDGPU_VM_FAULT_STOP_ALWAYS 2 -/* Reserve 4MB VRAM for page tables */ +/* How much VRAM be reserved for page tables */ #define AMDGPU_VM_RESERVED_VRAM (8ULL << 20) /* -- GitLab From 7075893d1d68b2b3517be250a02d86e76554ed22 Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Fri, 5 Jan 2024 21:02:09 -0100 Subject: [PATCH 0744/1290] drm/amd/display: cleanup inconsistent indenting in amdgpu_dm_color smatch warnings: amdgpu_dm_update_plane_color_mgmt() warn: inconsistent indenting Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401051643.PPdbmG1U-lkp@intel.com/ Signed-off-by: Melissa Wen Reviewed-by: Rodrigo Siqueira Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c index 9b527bffe11a1..c87b64e464ed5 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c @@ -1239,7 +1239,7 @@ int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, if (has_crtc_cm_degamma && ret != -EINVAL) { drm_dbg_kms(crtc->base.crtc->dev, "doesn't support plane and CRTC degamma at the same time\n"); - return -EINVAL; + return -EINVAL; } /* If we are here, it means we don't have plane degamma settings, check -- GitLab From 7dab24554dedd4e6f408af8eb2d25c89997a6a1f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 7 Jan 2024 16:12:23 -0800 Subject: [PATCH 0745/1290] md/raid1: Use blk_opf_t for read and write operations Use the type blk_opf_t for read and write operations instead of int. This patch does not affect the generated code but fixes the following sparse warning: drivers/md/raid1.c:1993:60: sparse: sparse: incorrect type in argument 5 (different base types) expected restricted blk_opf_t [usertype] opf got int rw Cc: Song Liu Cc: Jens Axboe Fixes: 3c5e514db58f ("md/raid1: Use the new blk_opf_t type") Cc: stable@vger.kernel.org # v6.0+ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401080657.UjFnvQgX-lkp@intel.com/ Signed-off-by: Bart Van Assche Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240108001223.23835-1-bvanassche@acm.org --- drivers/md/raid1.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index aaa434f0c1751..24f0d799fd98e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1968,12 +1968,12 @@ static void end_sync_write(struct bio *bio) } static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, - int sectors, struct page *page, int rw) + int sectors, struct page *page, blk_opf_t rw) { if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) /* success */ return 1; - if (rw == WRITE) { + if (rw == REQ_OP_WRITE) { set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) @@ -2090,7 +2090,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) rdev = conf->mirrors[d].rdev; if (r1_sync_page_io(rdev, sect, s, pages[idx], - WRITE) == 0) { + REQ_OP_WRITE) == 0) { r1_bio->bios[d]->bi_end_io = NULL; rdev_dec_pending(rdev, mddev); } @@ -2105,7 +2105,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) rdev = conf->mirrors[d].rdev; if (r1_sync_page_io(rdev, sect, s, pages[idx], - READ) != 0) + REQ_OP_READ) != 0) atomic_add(s, &rdev->corrected_errors); } sectors -= s; @@ -2321,7 +2321,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); r1_sync_page_io(rdev, sect, s, - conf->tmppage, WRITE); + conf->tmppage, REQ_OP_WRITE); rdev_dec_pending(rdev, mddev); } } @@ -2335,7 +2335,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); if (r1_sync_page_io(rdev, sect, s, - conf->tmppage, READ)) { + conf->tmppage, REQ_OP_READ)) { atomic_add(s, &rdev->corrected_errors); pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %pg)\n", mdname(mddev), s, -- GitLab From 4c115c9c1f81a6efe2bd68fcefec6836f7f3dc71 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:30 -0800 Subject: [PATCH 0746/1290] cxl/events: Create common event UUID defines Dan points out in review that the cxl_test code could be made better through the use of UUID's defines rather than being open coded.[1] Create UUID defines and use them rather than open coding them. Suggested-by: Dan Williams Signed-off-by: Ira Weiny Link: http://lore.kernel.org/r/65738d09e30e2_45e0129451@dwillia2-xfh.jf.intel.com.notmuch [1] Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-3-1bb8a4ca2c7a@intel.com [djbw: clang-format uuid definitions] Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 30 +++--------------------------- drivers/cxl/cxlmem.h | 24 ++++++++++++++++++++++++ tools/testing/cxl/test/mem.c | 9 +++------ 3 files changed, 30 insertions(+), 33 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 00f429c440dfe..1ccc3a56e0af6 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -836,46 +836,22 @@ out: } EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); -/* - * General Media Event Record - * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 - */ -static const uuid_t gen_media_event_uuid = - UUID_INIT(0xfbcd0a77, 0xc260, 0x417f, - 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6); - -/* - * DRAM Event Record - * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 - */ -static const uuid_t dram_event_uuid = - UUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, - 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24); - -/* - * Memory Module Event Record - * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 - */ -static const uuid_t mem_mod_event_uuid = - UUID_INIT(0xfe927475, 0xdd59, 0x4339, - 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74); - static void cxl_event_trace_record(const struct cxl_memdev *cxlmd, enum cxl_event_log_type type, struct cxl_event_record_raw *record) { uuid_t *id = &record->hdr.id; - if (uuid_equal(id, &gen_media_event_uuid)) { + if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID)) { struct cxl_event_gen_media *rec = (struct cxl_event_gen_media *)record; trace_cxl_general_media(cxlmd, type, id, rec); - } else if (uuid_equal(id, &dram_event_uuid)) { + } else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID)) { struct cxl_event_dram *rec = (struct cxl_event_dram *)record; trace_cxl_dram(cxlmd, type, id, rec); - } else if (uuid_equal(id, &mem_mod_event_uuid)) { + } else if (uuid_equal(id, &CXL_EVENT_MEM_MODULE_UUID)) { struct cxl_event_mem_module *rec = (struct cxl_event_mem_module *)record; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index f0e7ebb84f028..27575513ec689 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -580,6 +580,30 @@ struct cxl_mbox_identify { u8 qos_telemetry_caps; } __packed; +/* + * General Media Event Record UUID + * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 + */ +#define CXL_EVENT_GEN_MEDIA_UUID \ + UUID_INIT(0xfbcd0a77, 0xc260, 0x417f, 0x85, 0xa9, 0x08, 0x8b, 0x16, \ + 0x21, 0xeb, 0xa6) + +/* + * DRAM Event Record UUID + * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 + */ +#define CXL_EVENT_DRAM_UUID \ + UUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, \ + 0x5c, 0x96, 0x24) + +/* + * Memory Module Event Record UUID + * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 + */ +#define CXL_EVENT_MEM_MODULE_UUID \ + UUID_INIT(0xfe927475, 0xdd59, 0x4339, 0xa5, 0x86, 0x79, 0xba, 0xb1, \ + 0x13, 0xb7, 0x74) + /* * Get Event Records output payload * CXL rev 3.0 section 8.2.9.2.2; Table 8-50 diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index ee61fa3a2411f..5a95b04b329ae 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -362,8 +362,7 @@ struct cxl_event_record_raw hardware_replace = { struct cxl_event_gen_media gen_media = { .hdr = { - .id = UUID_INIT(0xfbcd0a77, 0xc260, 0x417f, - 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6), + .id = CXL_EVENT_GEN_MEDIA_UUID, .length = sizeof(struct cxl_event_gen_media), .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT, /* .handle = Set dynamically */ @@ -380,8 +379,7 @@ struct cxl_event_gen_media gen_media = { struct cxl_event_dram dram = { .hdr = { - .id = UUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, - 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24), + .id = CXL_EVENT_DRAM_UUID, .length = sizeof(struct cxl_event_dram), .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, /* .handle = Set dynamically */ @@ -400,8 +398,7 @@ struct cxl_event_dram dram = { struct cxl_event_mem_module mem_module = { .hdr = { - .id = UUID_INIT(0xfe927475, 0xdd59, 0x4339, - 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74), + .id = CXL_EVENT_MEM_MODULE_UUID, .length = sizeof(struct cxl_event_mem_module), /* .handle = Set dynamically */ .related_handle = cpu_to_le16(0), -- GitLab From 207a1f82301de0b4123f00a8d26ea55bb2484757 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:31 -0800 Subject: [PATCH 0747/1290] cxl/events: Remove passing a UUID to known event traces The UUID data is redundant in the known event trace types. The addition of static defines allows the trace macros to create the UUID data inside the trace thus removing unnecessary code. Have well known trace events use static data to set the uuid field based on the event type. Suggested-by: Jonathan Cameron Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-4-1bb8a4ca2c7a@intel.com Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 6 +++--- drivers/cxl/core/trace.h | 28 ++++++++++++++++------------ 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 1ccc3a56e0af6..5f3681de10dec 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -846,16 +846,16 @@ static void cxl_event_trace_record(const struct cxl_memdev *cxlmd, struct cxl_event_gen_media *rec = (struct cxl_event_gen_media *)record; - trace_cxl_general_media(cxlmd, type, id, rec); + trace_cxl_general_media(cxlmd, type, rec); } else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID)) { struct cxl_event_dram *rec = (struct cxl_event_dram *)record; - trace_cxl_dram(cxlmd, type, id, rec); + trace_cxl_dram(cxlmd, type, rec); } else if (uuid_equal(id, &CXL_EVENT_MEM_MODULE_UUID)) { struct cxl_event_mem_module *rec = (struct cxl_event_mem_module *)record; - trace_cxl_memory_module(cxlmd, type, id, rec); + trace_cxl_memory_module(cxlmd, type, rec); } else { /* For unknown record types print just the header */ trace_cxl_generic_event(cxlmd, type, id, record); diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index 3da16026b8dbf..312cfa9e00045 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -181,6 +181,7 @@ TRACE_EVENT(cxl_overflow, * 1) Add CXL_EVT_TP_entry to TP_STRUCT__entry * 2) Use CXL_EVT_TP_fast_assign within TP_fast_assign; * pass the dev, log, and CXL event header + * NOTE: The uuid must be assigned by the specific trace event * 3) Use CXL_EVT_TP_printk() instead of TP_printk() * * See the generic_event tracepoint as an example. @@ -198,12 +199,11 @@ TRACE_EVENT(cxl_overflow, __field(u8, hdr_length) \ __field(u8, hdr_maint_op_class) -#define CXL_EVT_TP_fast_assign(cxlmd, l, uuid, hdr) \ +#define CXL_EVT_TP_fast_assign(cxlmd, l, hdr) \ __assign_str(memdev, dev_name(&(cxlmd)->dev)); \ __assign_str(host, dev_name((cxlmd)->dev.parent)); \ __entry->log = (l); \ __entry->serial = (cxlmd)->cxlds->serial; \ - memcpy(&__entry->hdr_uuid, (uuid), sizeof(uuid_t)); \ __entry->hdr_length = (hdr).length; \ __entry->hdr_flags = get_unaligned_le24((hdr).flags); \ __entry->hdr_handle = le16_to_cpu((hdr).handle); \ @@ -235,7 +235,8 @@ TRACE_EVENT(cxl_generic_event, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, uuid, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + memcpy(&__entry->hdr_uuid, uuid, sizeof(uuid_t)); memcpy(__entry->data, &rec->data, CXL_EVENT_RECORD_DATA_LENGTH); ), @@ -315,9 +316,9 @@ TRACE_EVENT(cxl_generic_event, TRACE_EVENT(cxl_general_media, TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, - const uuid_t *uuid, struct cxl_event_gen_media *rec), + struct cxl_event_gen_media *rec), - TP_ARGS(cxlmd, log, uuid, rec), + TP_ARGS(cxlmd, log, rec), TP_STRUCT__entry( CXL_EVT_TP_entry @@ -336,7 +337,8 @@ TRACE_EVENT(cxl_general_media, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, uuid, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + memcpy(&__entry->hdr_uuid, &CXL_EVENT_GEN_MEDIA_UUID, sizeof(uuid_t)); /* General Media */ __entry->dpa = le64_to_cpu(rec->phys_addr); @@ -398,9 +400,9 @@ TRACE_EVENT(cxl_general_media, TRACE_EVENT(cxl_dram, TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, - const uuid_t *uuid, struct cxl_event_dram *rec), + struct cxl_event_dram *rec), - TP_ARGS(cxlmd, log, uuid, rec), + TP_ARGS(cxlmd, log, rec), TP_STRUCT__entry( CXL_EVT_TP_entry @@ -422,7 +424,8 @@ TRACE_EVENT(cxl_dram, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, uuid, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + memcpy(&__entry->hdr_uuid, &CXL_EVENT_DRAM_UUID, sizeof(uuid_t)); /* DRAM */ __entry->dpa = le64_to_cpu(rec->phys_addr); @@ -547,9 +550,9 @@ TRACE_EVENT(cxl_dram, TRACE_EVENT(cxl_memory_module, TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, - const uuid_t *uuid, struct cxl_event_mem_module *rec), + struct cxl_event_mem_module *rec), - TP_ARGS(cxlmd, log, uuid, rec), + TP_ARGS(cxlmd, log, rec), TP_STRUCT__entry( CXL_EVT_TP_entry @@ -569,7 +572,8 @@ TRACE_EVENT(cxl_memory_module, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, uuid, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + memcpy(&__entry->hdr_uuid, &CXL_EVENT_MEM_MODULE_UUID, sizeof(uuid_t)); /* Memory Module Event */ __entry->event_type = rec->event_type; -- GitLab From 6eade110754c085cee9e46f4d87d2c3ea4e59e8c Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:32 -0800 Subject: [PATCH 0748/1290] cxl/events: Separate UUID from event structures The UEFI CXL CPER structure does not include the UUID. Now that the UUID is passed separately to the trace event there is no need to have the UUID in those structures. Move UUID from the event record header to the raw structures. Adjust cxl-test to Create dummy structures for creating test records. Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-5-1bb8a4ca2c7a@intel.com Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 2 +- include/linux/cxl-event.h | 6 +- tools/testing/cxl/test/mem.c | 129 ++++++++++++++++++++--------------- 3 files changed, 81 insertions(+), 56 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 5f3681de10dec..4c51618968264 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -840,7 +840,7 @@ static void cxl_event_trace_record(const struct cxl_memdev *cxlmd, enum cxl_event_log_type type, struct cxl_event_record_raw *record) { - uuid_t *id = &record->hdr.id; + uuid_t *id = &record->id; if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID)) { struct cxl_event_gen_media *rec = diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 0fc068123f8ed..3d9b5954d0c11 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -8,7 +8,6 @@ * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 */ struct cxl_event_record_hdr { - uuid_t id; u8 length; u8 flags[3]; __le16 handle; @@ -18,8 +17,13 @@ struct cxl_event_record_hdr { u8 reserved[15]; } __packed; +/* + * Common Event Record Format + * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 + */ #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 struct cxl_event_record_raw { + uuid_t id; struct cxl_event_record_hdr hdr; u8 data[CXL_EVENT_RECORD_DATA_LENGTH]; } __packed; diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 5a95b04b329ae..9cc2b8ce1efd0 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -337,9 +337,9 @@ static void cxl_mock_event_trigger(struct device *dev) } struct cxl_event_record_raw maint_needed = { + .id = UUID_INIT(0xBA5EBA11, 0xABCD, 0xEFEB, + 0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5), .hdr = { - .id = UUID_INIT(0xBA5EBA11, 0xABCD, 0xEFEB, - 0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5), .length = sizeof(struct cxl_event_record_raw), .flags[0] = CXL_EVENT_RECORD_FLAG_MAINT_NEEDED, /* .handle = Set dynamically */ @@ -349,9 +349,9 @@ struct cxl_event_record_raw maint_needed = { }; struct cxl_event_record_raw hardware_replace = { + .id = UUID_INIT(0xABCDEFEB, 0xBA11, 0xBA5E, + 0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5), .hdr = { - .id = UUID_INIT(0xABCDEFEB, 0xBA11, 0xBA5E, - 0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5), .length = sizeof(struct cxl_event_record_raw), .flags[0] = CXL_EVENT_RECORD_FLAG_HW_REPLACE, /* .handle = Set dynamically */ @@ -360,61 +360,82 @@ struct cxl_event_record_raw hardware_replace = { .data = { 0xDE, 0xAD, 0xBE, 0xEF }, }; -struct cxl_event_gen_media gen_media = { - .hdr = { - .id = CXL_EVENT_GEN_MEDIA_UUID, - .length = sizeof(struct cxl_event_gen_media), - .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT, - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0), +struct cxl_test_gen_media { + uuid_t id; + struct cxl_event_gen_media rec; +} __packed; + +struct cxl_test_gen_media gen_media = { + .id = CXL_EVENT_GEN_MEDIA_UUID, + .rec = { + .hdr = { + .length = sizeof(struct cxl_test_gen_media), + .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT, + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0), + }, + .phys_addr = cpu_to_le64(0x2000), + .descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, + .type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR, + .transaction_type = CXL_GMER_TRANS_HOST_WRITE, + /* .validity_flags = */ + .channel = 1, + .rank = 30 }, - .phys_addr = cpu_to_le64(0x2000), - .descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, - .type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR, - .transaction_type = CXL_GMER_TRANS_HOST_WRITE, - /* .validity_flags = */ - .channel = 1, - .rank = 30 }; -struct cxl_event_dram dram = { - .hdr = { - .id = CXL_EVENT_DRAM_UUID, - .length = sizeof(struct cxl_event_dram), - .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0), +struct cxl_test_dram { + uuid_t id; + struct cxl_event_dram rec; +} __packed; + +struct cxl_test_dram dram = { + .id = CXL_EVENT_DRAM_UUID, + .rec = { + .hdr = { + .length = sizeof(struct cxl_test_dram), + .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0), + }, + .phys_addr = cpu_to_le64(0x8000), + .descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, + .type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR, + .transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB, + /* .validity_flags = */ + .channel = 1, + .bank_group = 5, + .bank = 2, + .column = {0xDE, 0xAD}, }, - .phys_addr = cpu_to_le64(0x8000), - .descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, - .type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR, - .transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB, - /* .validity_flags = */ - .channel = 1, - .bank_group = 5, - .bank = 2, - .column = {0xDE, 0xAD}, }; -struct cxl_event_mem_module mem_module = { - .hdr = { - .id = CXL_EVENT_MEM_MODULE_UUID, - .length = sizeof(struct cxl_event_mem_module), - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0), +struct cxl_test_mem_module { + uuid_t id; + struct cxl_event_mem_module rec; +} __packed; + +struct cxl_test_mem_module mem_module = { + .id = CXL_EVENT_MEM_MODULE_UUID, + .rec = { + .hdr = { + .length = sizeof(struct cxl_test_mem_module), + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0), + }, + .event_type = CXL_MMER_TEMP_CHANGE, + .info = { + .health_status = CXL_DHI_HS_PERFORMANCE_DEGRADED, + .media_status = CXL_DHI_MS_ALL_DATA_LOST, + .add_status = (CXL_DHI_AS_CRITICAL << 2) | + (CXL_DHI_AS_WARNING << 4) | + (CXL_DHI_AS_WARNING << 5), + .device_temp = { 0xDE, 0xAD}, + .dirty_shutdown_cnt = { 0xde, 0xad, 0xbe, 0xef }, + .cor_vol_err_cnt = { 0xde, 0xad, 0xbe, 0xef }, + .cor_per_err_cnt = { 0xde, 0xad, 0xbe, 0xef }, + } }, - .event_type = CXL_MMER_TEMP_CHANGE, - .info = { - .health_status = CXL_DHI_HS_PERFORMANCE_DEGRADED, - .media_status = CXL_DHI_MS_ALL_DATA_LOST, - .add_status = (CXL_DHI_AS_CRITICAL << 2) | - (CXL_DHI_AS_WARNING << 4) | - (CXL_DHI_AS_WARNING << 5), - .device_temp = { 0xDE, 0xAD}, - .dirty_shutdown_cnt = { 0xde, 0xad, 0xbe, 0xef }, - .cor_vol_err_cnt = { 0xde, 0xad, 0xbe, 0xef }, - .cor_per_err_cnt = { 0xde, 0xad, 0xbe, 0xef }, - } }; static int mock_set_timestamp(struct cxl_dev_state *cxlds, @@ -436,11 +457,11 @@ static int mock_set_timestamp(struct cxl_dev_state *cxlds, static void cxl_mock_add_event_logs(struct mock_event_store *mes) { put_unaligned_le16(CXL_GMER_VALID_CHANNEL | CXL_GMER_VALID_RANK, - &gen_media.validity_flags); + &gen_media.rec.validity_flags); put_unaligned_le16(CXL_DER_VALID_CHANNEL | CXL_DER_VALID_BANK_GROUP | CXL_DER_VALID_BANK | CXL_DER_VALID_COLUMN, - &dram.validity_flags); + &dram.rec.validity_flags); mes_add_event(mes, CXL_EVENT_TYPE_INFO, &maint_needed); mes_add_event(mes, CXL_EVENT_TYPE_INFO, -- GitLab From f9c683386f5bc0364615138ce2b14be50848dbcf Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:33 -0800 Subject: [PATCH 0749/1290] cxl/events: Create a CXL event union The CXL CPER and event log records share everything but a UUID/GUID in their structures. Define a cxl_event union without the UUID/GUID to be shared between the CPER and event log record formats. Adjust the code to use this union. Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-6-1bb8a4ca2c7a@intel.com Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 32 +++++++++++++------------------- drivers/cxl/core/trace.h | 8 ++++---- include/linux/cxl-event.h | 23 +++++++++++++++++------ tools/testing/cxl/test/mem.c | 31 ++++++++++++++++++------------- 4 files changed, 52 insertions(+), 42 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 4c51618968264..06957696247bd 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -840,26 +840,17 @@ static void cxl_event_trace_record(const struct cxl_memdev *cxlmd, enum cxl_event_log_type type, struct cxl_event_record_raw *record) { + union cxl_event *evt = &record->event; uuid_t *id = &record->id; - if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID)) { - struct cxl_event_gen_media *rec = - (struct cxl_event_gen_media *)record; - - trace_cxl_general_media(cxlmd, type, rec); - } else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID)) { - struct cxl_event_dram *rec = (struct cxl_event_dram *)record; - - trace_cxl_dram(cxlmd, type, rec); - } else if (uuid_equal(id, &CXL_EVENT_MEM_MODULE_UUID)) { - struct cxl_event_mem_module *rec = - (struct cxl_event_mem_module *)record; - - trace_cxl_memory_module(cxlmd, type, rec); - } else { - /* For unknown record types print just the header */ - trace_cxl_generic_event(cxlmd, type, id, record); - } + if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID)) + trace_cxl_general_media(cxlmd, type, &evt->gen_media); + else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID)) + trace_cxl_dram(cxlmd, type, &evt->dram); + else if (uuid_equal(id, &CXL_EVENT_MEM_MODULE_UUID)) + trace_cxl_memory_module(cxlmd, type, &evt->mem_module); + else + trace_cxl_generic_event(cxlmd, type, id, &evt->generic); } static int cxl_clear_event_record(struct cxl_memdev_state *mds, @@ -902,7 +893,10 @@ static int cxl_clear_event_record(struct cxl_memdev_state *mds, */ i = 0; for (cnt = 0; cnt < total; cnt++) { - payload->handles[i++] = get_pl->records[cnt].hdr.handle; + struct cxl_event_record_raw *raw = &get_pl->records[cnt]; + struct cxl_event_generic *gen = &raw->event.generic; + + payload->handles[i++] = gen->hdr.handle; dev_dbg(mds->cxlds.dev, "Event log '%d': Clearing %u\n", log, le16_to_cpu(payload->handles[i])); diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index 312cfa9e00045..89445435303aa 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -225,9 +225,9 @@ TRACE_EVENT(cxl_overflow, TRACE_EVENT(cxl_generic_event, TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, - const uuid_t *uuid, struct cxl_event_record_raw *rec), + const uuid_t *uuid, struct cxl_event_generic *gen_rec), - TP_ARGS(cxlmd, log, uuid, rec), + TP_ARGS(cxlmd, log, uuid, gen_rec), TP_STRUCT__entry( CXL_EVT_TP_entry @@ -235,9 +235,9 @@ TRACE_EVENT(cxl_generic_event, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, gen_rec->hdr); memcpy(&__entry->hdr_uuid, uuid, sizeof(uuid_t)); - memcpy(__entry->data, &rec->data, CXL_EVENT_RECORD_DATA_LENGTH); + memcpy(__entry->data, gen_rec->data, CXL_EVENT_RECORD_DATA_LENGTH); ), CXL_EVT_TP_printk("%s", diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 3d9b5954d0c11..4d6c05f535f80 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -17,13 +17,8 @@ struct cxl_event_record_hdr { u8 reserved[15]; } __packed; -/* - * Common Event Record Format - * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 - */ #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 -struct cxl_event_record_raw { - uuid_t id; +struct cxl_event_generic { struct cxl_event_record_hdr hdr; u8 data[CXL_EVENT_RECORD_DATA_LENGTH]; } __packed; @@ -96,4 +91,20 @@ struct cxl_event_mem_module { u8 reserved[0x3d]; } __packed; +union cxl_event { + struct cxl_event_generic generic; + struct cxl_event_gen_media gen_media; + struct cxl_event_dram dram; + struct cxl_event_mem_module mem_module; +} __packed; + +/* + * Common Event Record Format; in event logs + * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 + */ +struct cxl_event_record_raw { + uuid_t id; + union cxl_event event; +} __packed; + #endif /* _LINUX_CXL_EVENT_H */ diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 9cc2b8ce1efd0..35ee41e435ab3 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -251,7 +251,8 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) for (i = 0; i < CXL_TEST_EVENT_CNT && !event_log_empty(log); i++) { memcpy(&pl->records[i], event_get_current(log), sizeof(pl->records[i])); - pl->records[i].hdr.handle = event_get_cur_event_handle(log); + pl->records[i].event.generic.hdr.handle = + event_get_cur_event_handle(log); log->cur_idx++; } @@ -339,25 +340,29 @@ static void cxl_mock_event_trigger(struct device *dev) struct cxl_event_record_raw maint_needed = { .id = UUID_INIT(0xBA5EBA11, 0xABCD, 0xEFEB, 0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5), - .hdr = { - .length = sizeof(struct cxl_event_record_raw), - .flags[0] = CXL_EVENT_RECORD_FLAG_MAINT_NEEDED, - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0xa5b6), + .event.generic = { + .hdr = { + .length = sizeof(struct cxl_event_record_raw), + .flags[0] = CXL_EVENT_RECORD_FLAG_MAINT_NEEDED, + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0xa5b6), + }, + .data = { 0xDE, 0xAD, 0xBE, 0xEF }, }, - .data = { 0xDE, 0xAD, 0xBE, 0xEF }, }; struct cxl_event_record_raw hardware_replace = { .id = UUID_INIT(0xABCDEFEB, 0xBA11, 0xBA5E, 0xa5, 0x5a, 0xa5, 0x5a, 0xa5, 0xa5, 0x5a, 0xa5), - .hdr = { - .length = sizeof(struct cxl_event_record_raw), - .flags[0] = CXL_EVENT_RECORD_FLAG_HW_REPLACE, - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0xb6a5), + .event.generic = { + .hdr = { + .length = sizeof(struct cxl_event_record_raw), + .flags[0] = CXL_EVENT_RECORD_FLAG_HW_REPLACE, + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0xb6a5), + }, + .data = { 0xDE, 0xAD, 0xBE, 0xEF }, }, - .data = { 0xDE, 0xAD, 0xBE, 0xEF }, }; struct cxl_test_gen_media { -- GitLab From 671a794c33c6e048ca5cedd5ad6af44d52d5d7e5 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:34 -0800 Subject: [PATCH 0750/1290] acpi/ghes: Process CXL Component Events BIOS can configure memory devices as firmware first. This will send CXL events to the firmware instead of the OS. The firmware can then send these events to the OS via UEFI. UEFI v2.10 section N.2.14 defines a Common Platform Error Record (CPER) format for CXL Component Events. The format is mostly the same as the CXL Common Event Record Format. The difference is the use of a GUID in the Section Type rather than a UUID as part of the event itself. Add GHES support to detect CXL CPER records and call a registered callback with the event. A notifier chain was considered for the callback but the complexity did not justify the use case as only the CXL subsystem requires this event. Enforce that only one callback can be registered at any time. Cc: Ard Biesheuvel Cc: Rafael J. Wysocki Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-7-1bb8a4ca2c7a@intel.com [djbw: fixup checkpatch errors] Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- drivers/acpi/apei/ghes.c | 89 +++++++++++++++++++++++++++++++++++++++ include/linux/cxl-event.h | 50 ++++++++++++++++++++++ 2 files changed, 139 insertions(+) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 63ad0541db381..56a5d2ef9e0a0 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -657,6 +658,78 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata, schedule_work(&entry->work); } +/* + * Only a single callback can be registered for CXL CPER events. + */ +static DECLARE_RWSEM(cxl_cper_rw_sem); +static cxl_cper_callback cper_callback; + +/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ + +/* + * General Media Event Record + * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 + */ +#define CPER_SEC_CXL_GEN_MEDIA_GUID \ + GUID_INIT(0xfbcd0a77, 0xc260, 0x417f, \ + 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6) + +/* + * DRAM Event Record + * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 + */ +#define CPER_SEC_CXL_DRAM_GUID \ + GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, \ + 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24) + +/* + * Memory Module Event Record + * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 + */ +#define CPER_SEC_CXL_MEM_MODULE_GUID \ + GUID_INIT(0xfe927475, 0xdd59, 0x4339, \ + 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74) + +static void cxl_cper_post_event(enum cxl_event_type event_type, + struct cxl_cper_event_rec *rec) +{ + if (rec->hdr.length <= sizeof(rec->hdr) || + rec->hdr.length > sizeof(*rec)) { + pr_err(FW_WARN "CXL CPER Invalid section length (%u)\n", + rec->hdr.length); + return; + } + + if (!(rec->hdr.validation_bits & CPER_CXL_COMP_EVENT_LOG_VALID)) { + pr_err(FW_WARN "CXL CPER invalid event\n"); + return; + } + + guard(rwsem_read)(&cxl_cper_rw_sem); + if (cper_callback) + cper_callback(event_type, rec); +} + +int cxl_cper_register_callback(cxl_cper_callback callback) +{ + guard(rwsem_write)(&cxl_cper_rw_sem); + if (cper_callback) + return -EINVAL; + cper_callback = callback; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_register_callback, CXL); + +int cxl_cper_unregister_callback(cxl_cper_callback callback) +{ + guard(rwsem_write)(&cxl_cper_rw_sem); + if (callback != cper_callback) + return -EINVAL; + cper_callback = NULL; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_callback, CXL); + static bool ghes_do_proc(struct ghes *ghes, const struct acpi_hest_generic_status *estatus) { @@ -690,6 +763,22 @@ static bool ghes_do_proc(struct ghes *ghes, } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { queued = ghes_handle_arm_hw_error(gdata, sev); + } else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) { + struct cxl_cper_event_rec *rec = + acpi_hest_get_payload(gdata); + + cxl_cper_post_event(CXL_CPER_EVENT_GEN_MEDIA, rec); + } else if (guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID)) { + struct cxl_cper_event_rec *rec = + acpi_hest_get_payload(gdata); + + cxl_cper_post_event(CXL_CPER_EVENT_DRAM, rec); + } else if (guid_equal(sec_type, + &CPER_SEC_CXL_MEM_MODULE_GUID)) { + struct cxl_cper_event_rec *rec = + acpi_hest_get_payload(gdata); + + cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec); } else { void *err = acpi_hest_get_payload(gdata); diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 4d6c05f535f80..95841750a383b 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -107,4 +107,54 @@ struct cxl_event_record_raw { union cxl_event event; } __packed; +enum cxl_event_type { + CXL_CPER_EVENT_GEN_MEDIA, + CXL_CPER_EVENT_DRAM, + CXL_CPER_EVENT_MEM_MODULE, +}; + +#define CPER_CXL_DEVICE_ID_VALID BIT(0) +#define CPER_CXL_DEVICE_SN_VALID BIT(1) +#define CPER_CXL_COMP_EVENT_LOG_VALID BIT(2) +struct cxl_cper_event_rec { + struct { + u32 length; + u64 validation_bits; + struct cper_cxl_event_devid { + u16 vendor_id; + u16 device_id; + u8 func_num; + u8 device_num; + u8 bus_num; + u16 segment_num; + u16 slot_num; /* bits 2:0 reserved */ + u8 reserved; + } __packed device_id; + struct cper_cxl_event_sn { + u32 lower_dw; + u32 upper_dw; + } __packed dev_serial_num; + } __packed hdr; + + union cxl_event event; +} __packed; + +typedef void (*cxl_cper_callback)(enum cxl_event_type type, + struct cxl_cper_event_rec *rec); + +#ifdef CONFIG_ACPI_APEI_GHES +int cxl_cper_register_callback(cxl_cper_callback callback); +int cxl_cper_unregister_callback(cxl_cper_callback callback); +#else +static inline int cxl_cper_register_callback(cxl_cper_callback callback) +{ + return 0; +} + +static inline int cxl_cper_unregister_callback(cxl_cper_callback callback) +{ + return 0; +} +#endif + #endif /* _LINUX_CXL_EVENT_H */ -- GitLab From ced085ef369af7a2b6da962ec2fbd01339f60693 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:35 -0800 Subject: [PATCH 0751/1290] PCI: Introduce cleanup helpers for device reference counts and locks The "goto error" pattern is notorious for introducing subtle resource leaks. Use the new cleanup.h helpers for PCI device reference counts and locks. Similar to the new put_device() and device_lock() cleanup helpers, __free(put_device) and guard(device), define the same for PCI devices, __free(pci_dev_put) and guard(pci_dev). These helpers eliminate the need for "goto free;" and "goto unlock;" patterns. For example, A 'struct pci_dev *' instance declared as: struct pci_dev *pdev __free(pci_dev_put) = NULL; ...will automatically call pci_dev_put() if @pdev is non-NULL when @pdev goes out of scope (automatic variable scope). If a function wants to invoke pci_dev_put() on error, but return @pdev on success, it can do: return no_free_ptr(pdev); ...or: return_ptr(pdev); For potential cleanup opportunity there are 587 open-coded calls to pci_dev_put() in the kernel with 65 instances within 10 lines of a goto statement with the CXL driver threatening to add another one. The guard() helper holds the associated lock for the remainder of the current scope in which it was invoked. So, for example: func(...) { if (...) { ... guard(pci_dev); /* pci_dev_lock() invoked here */ ... } /* <- implied pci_dev_unlock() triggered here */ } There are 15 invocations of pci_dev_unlock() in the kernel with 5 instances within 10 lines of a goto statement. Again, the CXL driver is threatening to add another. Introduce these helpers to preclude the addition of new more error prone goto put; / goto unlock; sequences. For now, these helpers are used in drivers/cxl/pci.c to allow ACPI error reports to be fed back into the CXL driver associated with the PCI device identified in the report. Cc: Bjorn Helgaas Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-8-1bb8a4ca2c7a@intel.com [djbw: rewrite changelog] Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/pci.h b/include/linux/pci.h index dea043bc1e383..0d23d2e0eb1a4 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1170,6 +1170,7 @@ int pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge); u8 pci_common_swizzle(struct pci_dev *dev, u8 *pinp); struct pci_dev *pci_dev_get(struct pci_dev *dev); void pci_dev_put(struct pci_dev *dev); +DEFINE_FREE(pci_dev_put, struct pci_dev *, if (_T) pci_dev_put(_T)) void pci_remove_bus(struct pci_bus *b); void pci_stop_and_remove_bus_device(struct pci_dev *dev); void pci_stop_and_remove_bus_device_locked(struct pci_dev *dev); @@ -1874,6 +1875,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev); void pci_dev_lock(struct pci_dev *dev); int pci_dev_trylock(struct pci_dev *dev); void pci_dev_unlock(struct pci_dev *dev); +DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T)) /* * PCI domain support. Sometimes called PCI segment (eg by ACPI), -- GitLab From dc97f6344f205b0dfa144e1b3e16d6dc05383d57 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:36 -0800 Subject: [PATCH 0752/1290] cxl/pci: Register for and process CPER events If the firmware has configured CXL event support to be firmware first the OS can process those events through CPER records. The CXL layer has unique DPA to HPA knowledge and standard event trace parsing in place. CPER records contain Bus, Device, Function information which can be used to identify the PCI device which is sending the event. Change the PCI driver registration to include registration of a CXL CPER callback to process events through the trace subsystem. Use new scoped based management to simplify the handling of the PCI device object. Tested-by: Smita-Koralahalli Reviewed-by: Smita-Koralahalli Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-9-1bb8a4ca2c7a@intel.com Signed-off-by: Ira Weiny [djbw: use new pci_dev guard, flip init order] Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 40 +++++++++++++++++++-------- drivers/cxl/cxlmem.h | 4 +++ drivers/cxl/pci.c | 58 ++++++++++++++++++++++++++++++++++++++- include/linux/cxl-event.h | 1 + 4 files changed, 90 insertions(+), 13 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 06957696247bd..23021920aacee 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -836,21 +836,37 @@ out: } EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); -static void cxl_event_trace_record(const struct cxl_memdev *cxlmd, - enum cxl_event_log_type type, - struct cxl_event_record_raw *record) +void cxl_event_trace_record(const struct cxl_memdev *cxlmd, + enum cxl_event_log_type type, + enum cxl_event_type event_type, + const uuid_t *uuid, union cxl_event *evt) { - union cxl_event *evt = &record->event; - uuid_t *id = &record->id; - - if (uuid_equal(id, &CXL_EVENT_GEN_MEDIA_UUID)) + if (event_type == CXL_CPER_EVENT_GEN_MEDIA) trace_cxl_general_media(cxlmd, type, &evt->gen_media); - else if (uuid_equal(id, &CXL_EVENT_DRAM_UUID)) + else if (event_type == CXL_CPER_EVENT_DRAM) trace_cxl_dram(cxlmd, type, &evt->dram); - else if (uuid_equal(id, &CXL_EVENT_MEM_MODULE_UUID)) + else if (event_type == CXL_CPER_EVENT_MEM_MODULE) trace_cxl_memory_module(cxlmd, type, &evt->mem_module); else - trace_cxl_generic_event(cxlmd, type, id, &evt->generic); + trace_cxl_generic_event(cxlmd, type, uuid, &evt->generic); +} +EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, CXL); + +static void __cxl_event_trace_record(const struct cxl_memdev *cxlmd, + enum cxl_event_log_type type, + struct cxl_event_record_raw *record) +{ + enum cxl_event_type ev_type = CXL_CPER_EVENT_GENERIC; + const uuid_t *uuid = &record->id; + + if (uuid_equal(uuid, &CXL_EVENT_GEN_MEDIA_UUID)) + ev_type = CXL_CPER_EVENT_GEN_MEDIA; + else if (uuid_equal(uuid, &CXL_EVENT_DRAM_UUID)) + ev_type = CXL_CPER_EVENT_DRAM; + else if (uuid_equal(uuid, &CXL_EVENT_MEM_MODULE_UUID)) + ev_type = CXL_CPER_EVENT_MEM_MODULE; + + cxl_event_trace_record(cxlmd, type, ev_type, uuid, &record->event); } static int cxl_clear_event_record(struct cxl_memdev_state *mds, @@ -961,8 +977,8 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, break; for (i = 0; i < nr_rec; i++) - cxl_event_trace_record(cxlmd, type, - &payload->records[i]); + __cxl_event_trace_record(cxlmd, type, + &payload->records[i]); if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW) trace_cxl_overflow(cxlmd, type, payload); diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 27575513ec689..3c201324a3b3e 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -802,6 +802,10 @@ void set_exclusive_cxl_commands(struct cxl_memdev_state *mds, void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds, unsigned long *cmds); void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status); +void cxl_event_trace_record(const struct cxl_memdev *cxlmd, + enum cxl_event_log_type type, + enum cxl_event_type event_type, + const uuid_t *uuid, union cxl_event *evt); int cxl_set_timestamp(struct cxl_memdev_state *mds); int cxl_poison_state_init(struct cxl_memdev_state *mds); int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len, diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 0155fb66b580d..4fd1f207c84ee 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright(c) 2020 Intel Corporation. All rights reserved. */ +#include #include #include #include @@ -969,6 +970,61 @@ static struct pci_driver cxl_pci_driver = { }, }; +#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0) +static void cxl_cper_event_call(enum cxl_event_type ev_type, + struct cxl_cper_event_rec *rec) +{ + struct cper_cxl_event_devid *device_id = &rec->hdr.device_id; + struct pci_dev *pdev __free(pci_dev_put) = NULL; + enum cxl_event_log_type log_type; + struct cxl_dev_state *cxlds; + unsigned int devfn; + u32 hdr_flags; + + devfn = PCI_DEVFN(device_id->device_num, device_id->func_num); + pdev = pci_get_domain_bus_and_slot(device_id->segment_num, + device_id->bus_num, devfn); + if (!pdev) + return; + + guard(pci_dev)(pdev); + if (pdev->driver != &cxl_pci_driver) + return; + + cxlds = pci_get_drvdata(pdev); + if (!cxlds) + return; + + /* Fabricate a log type */ + hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags); + log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags); + + cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type, + &uuid_null, &rec->event); +} + +static int __init cxl_pci_driver_init(void) +{ + int rc; + + rc = cxl_cper_register_callback(cxl_cper_event_call); + if (rc) + return rc; + + rc = pci_register_driver(&cxl_pci_driver); + if (rc) + cxl_cper_unregister_callback(cxl_cper_event_call); + + return rc; +} + +static void __exit cxl_pci_driver_exit(void) +{ + pci_unregister_driver(&cxl_pci_driver); + cxl_cper_unregister_callback(cxl_cper_event_call); +} + +module_init(cxl_pci_driver_init); +module_exit(cxl_pci_driver_exit); MODULE_LICENSE("GPL v2"); -module_pci_driver(cxl_pci_driver); MODULE_IMPORT_NS(CXL); diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 95841750a383b..91125eca4c8ab 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -108,6 +108,7 @@ struct cxl_event_record_raw { } __packed; enum cxl_event_type { + CXL_CPER_EVENT_GENERIC, CXL_CPER_EVENT_GEN_MEDIA, CXL_CPER_EVENT_DRAM, CXL_CPER_EVENT_MEM_MODULE, -- GitLab From 25742aeb135c4a44e92fb347e037adaa145b9484 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 20 Dec 2023 08:10:28 -0500 Subject: [PATCH 0753/1290] ring-buffer: Remove stale comment from ring_buffer_size() It's been 11 years since the ring_buffer_size() function was updated to use the nr_pages from the buffer->buffers[cpu] structure instead of using the buffer->nr_pages that no longer exists. The comment in the code is more of what a change log should have and is pretty much useless for development. It's saying how things worked back in 2012 that bares no purpose on today's code. Remove it. Link: https://lore.kernel.org/linux-trace-kernel/84d3b41a72bd43dbb9d44921ef535c92@AcuMS.aculab.com/ Link: https://lore.kernel.org/linux-trace-kernel/20231220081028.7cd7e8e2@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reported-by: David Laight Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) --- kernel/trace/ring_buffer.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 173d2595ce2d9..7887d61d5b569 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5122,12 +5122,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_advance); */ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) { - /* - * Earlier, this method returned - * buffer->subbuf_size * buffer->nr_pages - * Since the nr_pages field is now removed, we have converted this to - * return the per cpu buffer value. - */ if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; -- GitLab From b6da6cbe13ebf24716438de71d50573b9f36f35d Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 25 Dec 2023 12:42:06 +0800 Subject: [PATCH 0754/1290] riscv: introduce RISCV_EFFICIENT_UNALIGNED_ACCESS Some riscv implementations such as T-HEAD's C906, C908, C910 and C920 support efficient unaligned access, for performance reason we want to enable HAVE_EFFICIENT_UNALIGNED_ACCESS on these platforms. To avoid performance regressions on other non efficient unaligned access platforms, HAVE_EFFICIENT_UNALIGNED_ACCESS can't be globally selected. To solve this problem, runtime code patching based on the detected speed is a good solution. But that's not easy, it involves lots of work to modify vairous subsystems such as net, mm, lib and so on. This can be done step by step. So let's take an easier solution: add support to efficient unaligned access and hide the support under NONPORTABLE. Now let's introduce RISCV_EFFICIENT_UNALIGNED_ACCESS which depends on NONPORTABLE, if users know during config time that the kernel will be only run on those efficient unaligned access hw platforms, they can enable it. Obviously, generic unified kernel Image shouldn't enable it. Signed-off-by: Jisheng Zhang Reviewed-by: Charlie Jenkins Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20231225044207.3821-2-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 13 +++++++++++++ arch/riscv/Makefile | 2 ++ 2 files changed, 15 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 95a2a06acc6a6..1ba03a2b509c3 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -651,6 +651,19 @@ config RISCV_MISALIGNED load/store for both kernel and userspace. When disable, misaligned accesses will generate SIGBUS in userspace and panic in kernel. +config RISCV_EFFICIENT_UNALIGNED_ACCESS + bool "Assume the CPU supports fast unaligned memory accesses" + depends on NONPORTABLE + select HAVE_EFFICIENT_UNALIGNED_ACCESS + help + Say Y here if you want the kernel to assume that the CPU supports + efficient unaligned memory accesses. When enabled, this option + improves the performance of the kernel on such CPUs. However, the + kernel will run much more slowly, or will not be able to run at all, + on CPUs that do not support efficient unaligned memory accesses. + + If unsure what to do here, say N. + endmenu # "Platform type" menu "Kernel features" diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index a74be78678eb0..ebbe02628a276 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -108,7 +108,9 @@ KBUILD_AFLAGS_MODULE += $(call as-option,-Wa$(comma)-mno-relax) # unaligned accesses. While unaligned accesses are explicitly allowed in the # RISC-V ISA, they're emulated by machine mode traps on all extant # architectures. It's faster to have GCC emit only aligned accesses. +ifneq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS),y) KBUILD_CFLAGS += $(call cc-option,-mstrict-align) +endif ifeq ($(CONFIG_STACKPROTECTOR_PER_TASK),y) prepare: stack_protector_prepare -- GitLab From d0fdc20b0429150c9dd09111f9b1d9d48117b56f Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 25 Dec 2023 12:42:07 +0800 Subject: [PATCH 0755/1290] riscv: select DCACHE_WORD_ACCESS for efficient unaligned access HW DCACHE_WORD_ACCESS uses the word-at-a-time API for optimised string comparisons in the vfs layer. This patch implements support for load_unaligned_zeropad in much the same way as has been done for arm64. Here is the test program and step: $ cat tt.c #include #include #include #define ITERATIONS 1000000 #define PATH "123456781234567812345678123456781" int main(void) { unsigned long i; struct stat buf; for (i = 0; i < ITERATIONS; i++) stat(PATH, &buf); return 0; } $ gcc -O2 tt.c $ touch 123456781234567812345678123456781 $ time ./a.out Per my test on T-HEAD C910 platforms, the above test performance is improved by about 7.5%. Signed-off-by: Jisheng Zhang Link: https://lore.kernel.org/r/20231225044207.3821-3-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/asm-extable.h | 15 ++++++++++++ arch/riscv/include/asm/word-at-a-time.h | 27 +++++++++++++++++++++ arch/riscv/mm/extable.c | 31 +++++++++++++++++++++++++ 4 files changed, 74 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 1ba03a2b509c3..39b0c594cc4eb 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -654,6 +654,7 @@ config RISCV_MISALIGNED config RISCV_EFFICIENT_UNALIGNED_ACCESS bool "Assume the CPU supports fast unaligned memory accesses" depends on NONPORTABLE + select DCACHE_WORD_ACCESS if MMU select HAVE_EFFICIENT_UNALIGNED_ACCESS help Say Y here if you want the kernel to assume that the CPU supports diff --git a/arch/riscv/include/asm/asm-extable.h b/arch/riscv/include/asm/asm-extable.h index 00a96e7a96644..0c8bfd54fc4e0 100644 --- a/arch/riscv/include/asm/asm-extable.h +++ b/arch/riscv/include/asm/asm-extable.h @@ -6,6 +6,7 @@ #define EX_TYPE_FIXUP 1 #define EX_TYPE_BPF 2 #define EX_TYPE_UACCESS_ERR_ZERO 3 +#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD 4 #ifdef CONFIG_MMU @@ -47,6 +48,11 @@ #define EX_DATA_REG_ZERO_SHIFT 5 #define EX_DATA_REG_ZERO GENMASK(9, 5) +#define EX_DATA_REG_DATA_SHIFT 0 +#define EX_DATA_REG_DATA GENMASK(4, 0) +#define EX_DATA_REG_ADDR_SHIFT 5 +#define EX_DATA_REG_ADDR GENMASK(9, 5) + #define EX_DATA_REG(reg, gpr) \ "((.L__gpr_num_" #gpr ") << " __stringify(EX_DATA_REG_##reg##_SHIFT) ")" @@ -62,6 +68,15 @@ #define _ASM_EXTABLE_UACCESS_ERR(insn, fixup, err) \ _ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, err, zero) +#define _ASM_EXTABLE_LOAD_UNALIGNED_ZEROPAD(insn, fixup, data, addr) \ + __DEFINE_ASM_GPR_NUMS \ + __ASM_EXTABLE_RAW(#insn, #fixup, \ + __stringify(EX_TYPE_LOAD_UNALIGNED_ZEROPAD), \ + "(" \ + EX_DATA_REG(DATA, data) " | " \ + EX_DATA_REG(ADDR, addr) \ + ")") + #endif /* __ASSEMBLY__ */ #else /* CONFIG_MMU */ diff --git a/arch/riscv/include/asm/word-at-a-time.h b/arch/riscv/include/asm/word-at-a-time.h index 7c086ac6ecd4a..f3f031e34191d 100644 --- a/arch/riscv/include/asm/word-at-a-time.h +++ b/arch/riscv/include/asm/word-at-a-time.h @@ -9,6 +9,7 @@ #define _ASM_RISCV_WORD_AT_A_TIME_H +#include #include struct word_at_a_time { @@ -45,4 +46,30 @@ static inline unsigned long find_zero(unsigned long mask) /* The mask we created is directly usable as a bytemask */ #define zero_bytemask(mask) (mask) +#ifdef CONFIG_DCACHE_WORD_ACCESS + +/* + * Load an unaligned word from kernel space. + * + * In the (very unlikely) case of the word being a page-crosser + * and the next page not being mapped, take the exception and + * return zeroes in the non-existing part. + */ +static inline unsigned long load_unaligned_zeropad(const void *addr) +{ + unsigned long ret; + + /* Load word from unaligned pointer addr */ + asm( + "1: " REG_L " %0, %2\n" + "2:\n" + _ASM_EXTABLE_LOAD_UNALIGNED_ZEROPAD(1b, 2b, %0, %1) + : "=&r" (ret) + : "r" (addr), "m" (*(unsigned long *)addr)); + + return ret; +} + +#endif /* CONFIG_DCACHE_WORD_ACCESS */ + #endif /* _ASM_RISCV_WORD_AT_A_TIME_H */ diff --git a/arch/riscv/mm/extable.c b/arch/riscv/mm/extable.c index 35484d830fd6d..dd1530af3ef15 100644 --- a/arch/riscv/mm/extable.c +++ b/arch/riscv/mm/extable.c @@ -27,6 +27,14 @@ static bool ex_handler_fixup(const struct exception_table_entry *ex, return true; } +static inline unsigned long regs_get_gpr(struct pt_regs *regs, unsigned int offset) +{ + if (unlikely(!offset || offset > MAX_REG_OFFSET)) + return 0; + + return *(unsigned long *)((unsigned long)regs + offset); +} + static inline void regs_set_gpr(struct pt_regs *regs, unsigned int offset, unsigned long val) { @@ -50,6 +58,27 @@ static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex, return true; } +static bool +ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex, + struct pt_regs *regs) +{ + int reg_data = FIELD_GET(EX_DATA_REG_DATA, ex->data); + int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned long data, addr, offset; + + addr = regs_get_gpr(regs, reg_addr * sizeof(unsigned long)); + + offset = addr & 0x7UL; + addr &= ~0x7UL; + + data = *(unsigned long *)addr >> (offset * 8); + + regs_set_gpr(regs, reg_data * sizeof(unsigned long), data); + + regs->epc = get_ex_fixup(ex); + return true; +} + bool fixup_exception(struct pt_regs *regs) { const struct exception_table_entry *ex; @@ -65,6 +94,8 @@ bool fixup_exception(struct pt_regs *regs) return ex_handler_bpf(ex, regs); case EX_TYPE_UACCESS_ERR_ZERO: return ex_handler_uaccess_err_zero(ex, regs); + case EX_TYPE_LOAD_UNALIGNED_ZEROPAD: + return ex_handler_load_unaligned_zeropad(ex, regs); } BUG(); -- GitLab From dc5b56241cf3e4084723c2e27758358e0feec84f Mon Sep 17 00:00:00 2001 From: Anshul Dalal Date: Tue, 9 Jan 2024 14:38:57 -0800 Subject: [PATCH 0756/1290] dt-bindings: input: bindings for Adafruit Seesaw Gamepad Adds bindings for the Adafruit Seesaw Gamepad. The gamepad functions as an i2c device with the default address of 0x50 and has an IRQ pin that can be enabled in the driver to allow for a rising edge trigger on each button press or joystick movement. Product page: https://www.adafruit.com/product/5743 Arduino driver: https://github.com/adafruit/Adafruit_Seesaw Reviewed-by: Conor Dooley Reviewed-by: Krzysztof Kozlowski Signed-off-by: Anshul Dalal Link: https://lore.kernel.org/r/20240106015111.882325-1-anshulusr@gmail.com Signed-off-by: Dmitry Torokhov --- .../input/adafruit,seesaw-gamepad.yaml | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/adafruit,seesaw-gamepad.yaml diff --git a/Documentation/devicetree/bindings/input/adafruit,seesaw-gamepad.yaml b/Documentation/devicetree/bindings/input/adafruit,seesaw-gamepad.yaml new file mode 100644 index 0000000000000..5e86f6de69784 --- /dev/null +++ b/Documentation/devicetree/bindings/input/adafruit,seesaw-gamepad.yaml @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/adafruit,seesaw-gamepad.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Adafruit Mini I2C Gamepad with seesaw + +maintainers: + - Anshul Dalal + +description: | + Adafruit Mini I2C Gamepad + + +-----------------------------+ + | ___ | + | / \ (X) | + | | S | __ __ (Y) (A) | + | \___/ |ST| |SE| (B) | + | | + +-----------------------------+ + + S -> 10-bit precision bidirectional analog joystick + ST -> Start + SE -> Select + X, A, B, Y -> Digital action buttons + + Datasheet: https://cdn-learn.adafruit.com/downloads/pdf/gamepad-qt.pdf + Product page: https://www.adafruit.com/product/5743 + Arduino Driver: https://github.com/adafruit/Adafruit_Seesaw + +properties: + compatible: + const: adafruit,seesaw-gamepad + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + description: + The gamepad's IRQ pin triggers a rising edge if interrupts are enabled. + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + joystick@50 { + compatible = "adafruit,seesaw-gamepad"; + interrupts = <18 IRQ_TYPE_EDGE_RISING>; + reg = <0x50>; + }; + }; -- GitLab From 52c4e5985a730796a3fa555b83b404708b960f9d Mon Sep 17 00:00:00 2001 From: Anshul Dalal Date: Tue, 9 Jan 2024 14:39:26 -0800 Subject: [PATCH 0757/1290] Input: driver for Adafruit Seesaw Gamepad MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a driver for a mini gamepad that communicates over i2c, the gamepad has bidirectional thumb stick input and six buttons. The gamepad chip utilizes the open framework from Adafruit called 'Seesaw' to transmit the ADC data for the joystick and digital pin state for the buttons. I have only implemented the functionality required to receive the thumb stick and button state. Steps in reading the gamepad state over i2c: 1. Reset the registers 2. Set the pin mode of the pins specified by the `BUTTON_MASK` to input `BUTTON_MASK`: A bit-map for the six digital pins internally connected to the joystick buttons. 3. Enable internal pullup resistors for the `BUTTON_MASK` 4. Bulk set the pin state HIGH for `BUTTON_MASK` 5. Poll the device for button and joystick state done by: `seesaw_read_data(struct i2c_client *client, struct seesaw_data *data)` Product page: https://www.adafruit.com/product/5743 Arduino driver: https://github.com/adafruit/Adafruit_Seesaw Driver tested on RPi Zero 2W Reviewed-by: Thomas Weißschuh Signed-off-by: Anshul Dalal Link: https://lore.kernel.org/r/20240106015111.882325-2-anshulusr@gmail.com Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 7 + drivers/input/joystick/Kconfig | 10 + drivers/input/joystick/Makefile | 1 + drivers/input/joystick/adafruit-seesaw.c | 315 +++++++++++++++++++++++ 4 files changed, 333 insertions(+) create mode 100644 drivers/input/joystick/adafruit-seesaw.c diff --git a/MAINTAINERS b/MAINTAINERS index 4cc6bf79fdd83..0595c832c2489 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -441,6 +441,13 @@ W: http://wiki.analog.com/AD7879 W: https://ez.analog.com/linux-software-drivers F: drivers/input/touchscreen/ad7879.c +ADAFRUIT MINI I2C GAMEPAD +M: Anshul Dalal +L: linux-input@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/input/adafruit,seesaw-gamepad.yaml +F: drivers/input/joystick/adafruit-seesaw.c + ADDRESS SPACE LAYOUT RANDOMIZATION (ASLR) M: Jiri Kosina S: Maintained diff --git a/drivers/input/joystick/Kconfig b/drivers/input/joystick/Kconfig index ac6925ce83667..7755e5b454d2c 100644 --- a/drivers/input/joystick/Kconfig +++ b/drivers/input/joystick/Kconfig @@ -412,4 +412,14 @@ config JOYSTICK_SENSEHAT To compile this driver as a module, choose M here: the module will be called sensehat_joystick. +config JOYSTICK_SEESAW + tristate "Adafruit Mini I2C Gamepad with Seesaw" + depends on I2C + select INPUT_SPARSEKMAP + help + Say Y here if you want to use the Adafruit Mini I2C Gamepad. + + To compile this driver as a module, choose M here: the module will be + called adafruit-seesaw. + endif diff --git a/drivers/input/joystick/Makefile b/drivers/input/joystick/Makefile index 3937535f00981..9976f596a9208 100644 --- a/drivers/input/joystick/Makefile +++ b/drivers/input/joystick/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_JOYSTICK_N64) += n64joy.o obj-$(CONFIG_JOYSTICK_PSXPAD_SPI) += psxpad-spi.o obj-$(CONFIG_JOYSTICK_PXRC) += pxrc.o obj-$(CONFIG_JOYSTICK_QWIIC) += qwiic-joystick.o +obj-$(CONFIG_JOYSTICK_SEESAW) += adafruit-seesaw.o obj-$(CONFIG_JOYSTICK_SENSEHAT) += sensehat-joystick.o obj-$(CONFIG_JOYSTICK_SIDEWINDER) += sidewinder.o obj-$(CONFIG_JOYSTICK_SPACEBALL) += spaceball.o diff --git a/drivers/input/joystick/adafruit-seesaw.c b/drivers/input/joystick/adafruit-seesaw.c new file mode 100644 index 0000000000000..1b9279f024cc6 --- /dev/null +++ b/drivers/input/joystick/adafruit-seesaw.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Anshul Dalal + * + * Driver for Adafruit Mini I2C Gamepad + * + * Based on the work of: + * Oleh Kravchenko (Sparkfun Qwiic Joystick driver) + * + * Datasheet: https://cdn-learn.adafruit.com/downloads/pdf/gamepad-qt.pdf + * Product page: https://www.adafruit.com/product/5743 + * Firmware and hardware sources: https://github.com/adafruit/Adafruit_Seesaw + * + * TODO: + * - Add interrupt support + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define SEESAW_DEVICE_NAME "seesaw-gamepad" + +#define SEESAW_ADC_BASE 0x0900 + +#define SEESAW_GPIO_DIRCLR_BULK 0x0103 +#define SEESAW_GPIO_BULK 0x0104 +#define SEESAW_GPIO_BULK_SET 0x0105 +#define SEESAW_GPIO_PULLENSET 0x010b + +#define SEESAW_STATUS_HW_ID 0x0001 +#define SEESAW_STATUS_SWRST 0x007f + +#define SEESAW_ADC_OFFSET 0x07 + +#define SEESAW_BUTTON_A 0x05 +#define SEESAW_BUTTON_B 0x01 +#define SEESAW_BUTTON_X 0x06 +#define SEESAW_BUTTON_Y 0x02 +#define SEESAW_BUTTON_START 0x10 +#define SEESAW_BUTTON_SELECT 0x00 + +#define SEESAW_ANALOG_X 0x0e +#define SEESAW_ANALOG_Y 0x0f + +#define SEESAW_JOYSTICK_MAX_AXIS 1023 +#define SEESAW_JOYSTICK_FUZZ 2 +#define SEESAW_JOYSTICK_FLAT 4 + +#define SEESAW_GAMEPAD_POLL_INTERVAL_MS 16 +#define SEESAW_GAMEPAD_POLL_MIN 8 +#define SEESAW_GAMEPAD_POLL_MAX 32 + +static const unsigned long SEESAW_BUTTON_MASK = + BIT(SEESAW_BUTTON_A) | BIT(SEESAW_BUTTON_B) | BIT(SEESAW_BUTTON_X) | + BIT(SEESAW_BUTTON_Y) | BIT(SEESAW_BUTTON_START) | + BIT(SEESAW_BUTTON_SELECT); + +struct seesaw_gamepad { + struct input_dev *input_dev; + struct i2c_client *i2c_client; +}; + +struct seesaw_data { + u16 x; + u16 y; + u32 button_state; +}; + +static const struct key_entry seesaw_buttons_new[] = { + { KE_KEY, SEESAW_BUTTON_A, .keycode = BTN_SOUTH }, + { KE_KEY, SEESAW_BUTTON_B, .keycode = BTN_EAST }, + { KE_KEY, SEESAW_BUTTON_X, .keycode = BTN_NORTH }, + { KE_KEY, SEESAW_BUTTON_Y, .keycode = BTN_WEST }, + { KE_KEY, SEESAW_BUTTON_START, .keycode = BTN_START }, + { KE_KEY, SEESAW_BUTTON_SELECT, .keycode = BTN_SELECT }, + { KE_END, 0 } +}; + +static int seesaw_register_read(struct i2c_client *client, u16 reg, void *buf, + int count) +{ + __be16 register_buf = cpu_to_be16(reg); + struct i2c_msg message_buf[2] = { + { + .addr = client->addr, + .flags = client->flags, + .len = sizeof(register_buf), + .buf = (u8 *)®ister_buf, + }, + { + .addr = client->addr, + .flags = client->flags | I2C_M_RD, + .len = count, + .buf = (u8 *)buf, + }, + }; + int ret; + + ret = i2c_transfer(client->adapter, message_buf, + ARRAY_SIZE(message_buf)); + if (ret < 0) + return ret; + + return 0; +} + +static int seesaw_register_write_u8(struct i2c_client *client, u16 reg, + u8 value) +{ + u8 write_buf[sizeof(reg) + sizeof(value)]; + int ret; + + put_unaligned_be16(reg, write_buf); + write_buf[sizeof(reg)] = value; + + ret = i2c_master_send(client, write_buf, sizeof(write_buf)); + if (ret < 0) + return ret; + + return 0; +} + +static int seesaw_register_write_u32(struct i2c_client *client, u16 reg, + u32 value) +{ + u8 write_buf[sizeof(reg) + sizeof(value)]; + int ret; + + put_unaligned_be16(reg, write_buf); + put_unaligned_be32(value, write_buf + sizeof(reg)); + ret = i2c_master_send(client, write_buf, sizeof(write_buf)); + if (ret < 0) + return ret; + + return 0; +} + +static int seesaw_read_data(struct i2c_client *client, struct seesaw_data *data) +{ + __be16 adc_data; + __be32 read_buf; + int err; + + err = seesaw_register_read(client, SEESAW_GPIO_BULK, + &read_buf, sizeof(read_buf)); + if (err) + return err; + + data->button_state = ~be32_to_cpu(read_buf); + + err = seesaw_register_read(client, + SEESAW_ADC_BASE | + (SEESAW_ADC_OFFSET + SEESAW_ANALOG_X), + &adc_data, sizeof(adc_data)); + if (err) + return err; + /* + * ADC reads left as max and right as 0, must be reversed since kernel + * expects reports in opposite order. + */ + data->x = SEESAW_JOYSTICK_MAX_AXIS - be16_to_cpu(adc_data); + + err = seesaw_register_read(client, + SEESAW_ADC_BASE | + (SEESAW_ADC_OFFSET + SEESAW_ANALOG_Y), + &adc_data, sizeof(adc_data)); + if (err) + return err; + + data->y = be16_to_cpu(adc_data); + + return 0; +} + +static void seesaw_poll(struct input_dev *input) +{ + struct seesaw_gamepad *private = input_get_drvdata(input); + struct seesaw_data data; + int err, i; + + err = seesaw_read_data(private->i2c_client, &data); + if (err) { + dev_err_ratelimited(&input->dev, + "failed to read joystick state: %d\n", err); + return; + } + + input_report_abs(input, ABS_X, data.x); + input_report_abs(input, ABS_Y, data.y); + + for_each_set_bit(i, &SEESAW_BUTTON_MASK, + BITS_PER_TYPE(SEESAW_BUTTON_MASK)) { + if (!sparse_keymap_report_event(input, i, + data.button_state & BIT(i), + false)) + dev_err_ratelimited(&input->dev, + "failed to report keymap event"); + } + + input_sync(input); +} + +static int seesaw_probe(struct i2c_client *client) +{ + struct seesaw_gamepad *seesaw; + u8 hardware_id; + int err; + + err = seesaw_register_write_u8(client, SEESAW_STATUS_SWRST, 0xFF); + if (err) + return err; + + /* Wait for the registers to reset before proceeding */ + usleep_range(10000, 15000); + + seesaw = devm_kzalloc(&client->dev, sizeof(*seesaw), GFP_KERNEL); + if (!seesaw) + return -ENOMEM; + + err = seesaw_register_read(client, SEESAW_STATUS_HW_ID, + &hardware_id, sizeof(hardware_id)); + if (err) + return err; + + dev_dbg(&client->dev, "Adafruit Seesaw Gamepad, Hardware ID: %02x\n", + hardware_id); + + /* Set Pin Mode to input and enable pull-up resistors */ + err = seesaw_register_write_u32(client, SEESAW_GPIO_DIRCLR_BULK, + SEESAW_BUTTON_MASK); + if (err) + return err; + err = seesaw_register_write_u32(client, SEESAW_GPIO_PULLENSET, + SEESAW_BUTTON_MASK); + if (err) + return err; + err = seesaw_register_write_u32(client, SEESAW_GPIO_BULK_SET, + SEESAW_BUTTON_MASK); + if (err) + return err; + + seesaw->i2c_client = client; + seesaw->input_dev = devm_input_allocate_device(&client->dev); + if (!seesaw->input_dev) + return -ENOMEM; + + seesaw->input_dev->id.bustype = BUS_I2C; + seesaw->input_dev->name = "Adafruit Seesaw Gamepad"; + seesaw->input_dev->phys = "i2c/" SEESAW_DEVICE_NAME; + input_set_drvdata(seesaw->input_dev, seesaw); + input_set_abs_params(seesaw->input_dev, ABS_X, + 0, SEESAW_JOYSTICK_MAX_AXIS, + SEESAW_JOYSTICK_FUZZ, SEESAW_JOYSTICK_FLAT); + input_set_abs_params(seesaw->input_dev, ABS_Y, + 0, SEESAW_JOYSTICK_MAX_AXIS, + SEESAW_JOYSTICK_FUZZ, SEESAW_JOYSTICK_FLAT); + + err = sparse_keymap_setup(seesaw->input_dev, seesaw_buttons_new, NULL); + if (err) { + dev_err(&client->dev, + "failed to set up input device keymap: %d\n", err); + return err; + } + + err = input_setup_polling(seesaw->input_dev, seesaw_poll); + if (err) { + dev_err(&client->dev, "failed to set up polling: %d\n", err); + return err; + } + + input_set_poll_interval(seesaw->input_dev, + SEESAW_GAMEPAD_POLL_INTERVAL_MS); + input_set_max_poll_interval(seesaw->input_dev, SEESAW_GAMEPAD_POLL_MAX); + input_set_min_poll_interval(seesaw->input_dev, SEESAW_GAMEPAD_POLL_MIN); + + err = input_register_device(seesaw->input_dev); + if (err) { + dev_err(&client->dev, "failed to register joystick: %d\n", err); + return err; + } + + return 0; +} + +static const struct i2c_device_id seesaw_id_table[] = { + { SEESAW_DEVICE_NAME }, + { /* Sentinel */ } +}; +MODULE_DEVICE_TABLE(i2c, seesaw_id_table); + +static const struct of_device_id seesaw_of_table[] = { + { .compatible = "adafruit,seesaw-gamepad"}, + { /* Sentinel */ } +}; +MODULE_DEVICE_TABLE(of, seesaw_of_table); + +static struct i2c_driver seesaw_driver = { + .driver = { + .name = SEESAW_DEVICE_NAME, + .of_match_table = seesaw_of_table, + }, + .id_table = seesaw_id_table, + .probe = seesaw_probe, +}; +module_i2c_driver(seesaw_driver); + +MODULE_AUTHOR("Anshul Dalal "); +MODULE_DESCRIPTION("Adafruit Mini I2C Gamepad driver"); +MODULE_LICENSE("GPL"); -- GitLab From 1b5e94657320c86fc660745e3fc64321948649be Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 30 Dec 2023 22:51:56 +0900 Subject: [PATCH 0758/1290] kbuild: deb-pkg: move 'make headers' to build-arch Strictly speaking, 'make headers' should be a part of build-arch instead of binary-arch. 'make headers' constructs ready-to-copy UAPI headers in the kernel directory. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/builddeb | 1 - scripts/package/debian/rules | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index cc8c7a807fcc2..842ee4b405285 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -155,7 +155,6 @@ install_libc_headers () { rm -rf $pdir - $MAKE -f $srctree/Makefile headers $MAKE -f $srctree/Makefile headers_install INSTALL_HDR_PATH=$pdir/usr # move asm headers to /usr/include//asm to match the structure diff --git a/scripts/package/debian/rules b/scripts/package/debian/rules index 7ab31419579f0..0983077800622 100755 --- a/scripts/package/debian/rules +++ b/scripts/package/debian/rules @@ -26,8 +26,8 @@ binary-arch: build-arch build: build-arch build-indep build-indep: build-arch: - $(MAKE) $(make-opts) \ - olddefconfig all + $(MAKE) $(make-opts) olddefconfig + $(MAKE) $(make-opts) $(if $(filter um,$(ARCH)),,headers) all .PHONY: clean clean: -- GitLab From 6185d32170b683abadddf1e68be998e24f3cc5de Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 30 Dec 2023 22:51:58 +0900 Subject: [PATCH 0759/1290] kbuild: deb-pkg: use debian/ for tmpdir Use debian/ for tmpdir, which is the default of debhelper. This simplifies the code. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/package/builddeb | 41 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index 842ee4b405285..bf96a3c246081 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -25,9 +25,7 @@ if_enabled_echo() { } create_package() { - local pname="$1" pdir="$2" - - export DH_OPTIONS="-p${pname} -P${pdir}" + export DH_OPTIONS="-p${1}" dh_installdocs dh_installchangelogs @@ -39,8 +37,8 @@ create_package() { } install_linux_image () { - pdir=$1 - pname=$2 + pname=$1 + pdir=debian/$1 rm -rf ${pdir} @@ -109,7 +107,7 @@ install_linux_image () { } install_linux_image_dbg () { - pdir=$1 + pdir=debian/$1 rm -rf ${pdir} @@ -139,8 +137,8 @@ install_linux_image_dbg () { } install_kernel_headers () { - pdir=$1 - version=$2 + pdir=debian/$1 + version=${1#linux-headers-} rm -rf $pdir @@ -151,7 +149,7 @@ install_kernel_headers () { } install_libc_headers () { - pdir=$1 + pdir=debian/$1 rm -rf $pdir @@ -171,28 +169,13 @@ for package in ${packages_enabled} do case ${package} in *-dbg) - install_linux_image_dbg debian/linux-image-dbg;; - linux-image-*|user-mode-linux-*) - install_linux_image debian/linux-image ${package};; - linux-libc-dev) - install_libc_headers debian/linux-libc-dev;; - linux-headers-*) - install_kernel_headers debian/linux-headers ${package#linux-headers-};; - esac -done - -for package in ${packages_enabled} -do - case ${package} in - *-dbg) - create_package ${package} debian/linux-image-dbg;; + install_linux_image_dbg "${package}";; linux-image-*|user-mode-linux-*) - create_package ${package} debian/linux-image;; + install_linux_image "${package}";; linux-libc-dev) - create_package ${package} debian/linux-libc-dev;; + install_libc_headers "${package}";; linux-headers-*) - create_package ${package} debian/linux-headers;; + install_kernel_headers "${package}";; esac + create_package "${package}" done - -exit 0 -- GitLab From e70b8dd26711704b1ff1f1b4eb3d048ba69e29da Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Wed, 10 Jan 2024 11:57:57 +0100 Subject: [PATCH 0760/1290] ASoC: mediatek: mt8195: Remove afe-dai component and rework codec link Remove the extra 'mt8195-afe-pcm-dai' component, register the DAI drivers to the main AFE component, and rework the DAI linking between the headset codec (RT5682/RT5682S) and the TDM interface in the probe function to stop assigning name, relying on the of_node of the codec. Also replace the COMP_DUMMY codec entry with a COMP_EMPTY for the ETDM2_IN and remove it entirely from ETDM1_OUT to fix the registration flow for this sound card. While at it, since we also need to swap the codec init function from ETDM2_IN to ETDM1_OUT, remove the static assignment of both `ops` and `init` for both, as we now assign these dynamically during probe. Fixes: 13f58267cda3 ("ASoC: soc.h: don't create dummy Component via COMP_DUMMY()") Signed-off-by: AngeloGioacchino Del Regno Link: https://msgid.link/r/20240110105757.539089-1-angelogioacchino.delregno@collabora.com Signed-off-by: Mark Brown --- sound/soc/mediatek/mt8195/mt8195-afe-pcm.c | 33 +---------------- sound/soc/mediatek/mt8195/mt8195-mt6359.c | 41 +++++++++++++++------- 2 files changed, 29 insertions(+), 45 deletions(-) diff --git a/sound/soc/mediatek/mt8195/mt8195-afe-pcm.c b/sound/soc/mediatek/mt8195/mt8195-afe-pcm.c index 1e33863c85ca0..620d7ade1992e 100644 --- a/sound/soc/mediatek/mt8195/mt8195-afe-pcm.c +++ b/sound/soc/mediatek/mt8195/mt8195-afe-pcm.c @@ -1795,10 +1795,6 @@ static const struct snd_kcontrol_new mt8195_memif_controls[] = { MT8195_AFE_IRQ_28), }; -static const struct snd_soc_component_driver mt8195_afe_pcm_dai_component = { - .name = "mt8195-afe-pcm-dai", -}; - static const struct mtk_base_memif_data memif_data[MT8195_AFE_MEMIF_NUM] = { [MT8195_AFE_MEMIF_DL2] = { .name = "DL2", @@ -3037,7 +3033,6 @@ static int mt8195_afe_pcm_dev_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct reset_control *rstc; int i, irq_id, ret; - struct snd_soc_component *component; ret = of_reserved_mem_device_init(dev); if (ret) @@ -3170,36 +3165,12 @@ static int mt8195_afe_pcm_dev_probe(struct platform_device *pdev) /* register component */ ret = devm_snd_soc_register_component(dev, &mt8195_afe_component, - NULL, 0); + afe->dai_drivers, afe->num_dai_drivers); if (ret) { dev_warn(dev, "err_platform\n"); goto err_pm_put; } - component = devm_kzalloc(dev, sizeof(*component), GFP_KERNEL); - if (!component) { - ret = -ENOMEM; - goto err_pm_put; - } - - ret = snd_soc_component_initialize(component, - &mt8195_afe_pcm_dai_component, - dev); - if (ret) - goto err_pm_put; - -#ifdef CONFIG_DEBUG_FS - component->debugfs_prefix = "pcm"; -#endif - - ret = snd_soc_add_component(component, - afe->dai_drivers, - afe->num_dai_drivers); - if (ret) { - dev_warn(dev, "err_dai_component\n"); - goto err_pm_put; - } - ret = regmap_multi_reg_write(afe->regmap, mt8195_afe_reg_defaults, ARRAY_SIZE(mt8195_afe_reg_defaults)); if (ret) @@ -3224,8 +3195,6 @@ err_pm_put: static void mt8195_afe_pcm_dev_remove(struct platform_device *pdev) { - snd_soc_unregister_component(&pdev->dev); - pm_runtime_disable(&pdev->dev); if (!pm_runtime_status_suspended(&pdev->dev)) mt8195_afe_runtime_suspend(&pdev->dev); diff --git a/sound/soc/mediatek/mt8195/mt8195-mt6359.c b/sound/soc/mediatek/mt8195/mt8195-mt6359.c index 4feb9fb769679..53fd8a897b9d2 100644 --- a/sound/soc/mediatek/mt8195/mt8195-mt6359.c +++ b/sound/soc/mediatek/mt8195/mt8195-mt6359.c @@ -934,12 +934,11 @@ SND_SOC_DAILINK_DEFS(ETDM1_IN_BE, SND_SOC_DAILINK_DEFS(ETDM2_IN_BE, DAILINK_COMP_ARRAY(COMP_CPU("ETDM2_IN")), - DAILINK_COMP_ARRAY(COMP_DUMMY()), + DAILINK_COMP_ARRAY(COMP_EMPTY()), DAILINK_COMP_ARRAY(COMP_EMPTY())); SND_SOC_DAILINK_DEFS(ETDM1_OUT_BE, DAILINK_COMP_ARRAY(COMP_CPU("ETDM1_OUT")), - DAILINK_COMP_ARRAY(COMP_DUMMY()), DAILINK_COMP_ARRAY(COMP_EMPTY())); SND_SOC_DAILINK_DEFS(ETDM2_OUT_BE, @@ -1237,8 +1236,6 @@ static struct snd_soc_dai_link mt8195_mt6359_dai_links[] = { SND_SOC_DAIFMT_NB_NF | SND_SOC_DAIFMT_CBS_CFS, .dpcm_capture = 1, - .init = mt8195_rt5682_init, - .ops = &mt8195_rt5682_etdm_ops, .be_hw_params_fixup = mt8195_etdm_hw_params_fixup, SND_SOC_DAILINK_REG(ETDM2_IN_BE), }, @@ -1249,7 +1246,6 @@ static struct snd_soc_dai_link mt8195_mt6359_dai_links[] = { SND_SOC_DAIFMT_NB_NF | SND_SOC_DAIFMT_CBS_CFS, .dpcm_playback = 1, - .ops = &mt8195_rt5682_etdm_ops, .be_hw_params_fixup = mt8195_etdm_hw_params_fixup, SND_SOC_DAILINK_REG(ETDM1_OUT_BE), }, @@ -1381,7 +1377,7 @@ static int mt8195_mt6359_dev_probe(struct platform_device *pdev) struct snd_soc_dai_link *dai_link; struct mtk_soc_card_data *soc_card_data; struct mt8195_mt6359_priv *mach_priv; - struct device_node *platform_node, *adsp_node, *dp_node, *hdmi_node; + struct device_node *platform_node, *adsp_node, *codec_node, *dp_node, *hdmi_node; struct mt8195_card_data *card_data; int is5682s = 0; int init6359 = 0; @@ -1401,8 +1397,12 @@ static int mt8195_mt6359_dev_probe(struct platform_device *pdev) if (!card->name) card->name = card_data->name; - if (strstr(card->name, "_5682s")) + if (strstr(card->name, "_5682s")) { + codec_node = of_find_compatible_node(NULL, NULL, "realtek,rt5682s"); is5682s = 1; + } else + codec_node = of_find_compatible_node(NULL, NULL, "realtek,rt5682i"); + soc_card_data = devm_kzalloc(&pdev->dev, sizeof(*card_data), GFP_KERNEL); if (!soc_card_data) return -ENOMEM; @@ -1488,12 +1488,27 @@ static int mt8195_mt6359_dev_probe(struct platform_device *pdev) dai_link->codecs->dai_name = "i2s-hifi"; dai_link->init = mt8195_hdmi_codec_init; } - } else if (strcmp(dai_link->name, "ETDM1_OUT_BE") == 0 || - strcmp(dai_link->name, "ETDM2_IN_BE") == 0) { - dai_link->codecs->name = - is5682s ? RT5682S_DEV0_NAME : RT5682_DEV0_NAME; - dai_link->codecs->dai_name = - is5682s ? RT5682S_CODEC_DAI : RT5682_CODEC_DAI; + } else if (strcmp(dai_link->name, "ETDM1_OUT_BE") == 0) { + if (!codec_node) { + dev_err(&pdev->dev, "Codec not found!\n"); + } else { + dai_link->codecs->of_node = codec_node; + dai_link->codecs->name = NULL; + dai_link->codecs->dai_name = + is5682s ? RT5682S_CODEC_DAI : RT5682_CODEC_DAI; + dai_link->init = mt8195_rt5682_init; + dai_link->ops = &mt8195_rt5682_etdm_ops; + } + } else if (strcmp(dai_link->name, "ETDM2_IN_BE") == 0) { + if (!codec_node) { + dev_err(&pdev->dev, "Codec not found!\n"); + } else { + dai_link->codecs->of_node = codec_node; + dai_link->codecs->name = NULL; + dai_link->codecs->dai_name = + is5682s ? RT5682S_CODEC_DAI : RT5682_CODEC_DAI; + dai_link->ops = &mt8195_rt5682_etdm_ops; + } } else if (strcmp(dai_link->name, "DL_SRC_BE") == 0 || strcmp(dai_link->name, "UL_SRC1_BE") == 0 || strcmp(dai_link->name, "UL_SRC2_BE") == 0) { -- GitLab From 78996eee79ebdfe8b6f0e54cb6dcc792d5129291 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Thu, 4 Jan 2024 11:42:47 -0800 Subject: [PATCH 0761/1290] riscv: Fix module loading free order Reverse order of kfree calls to resolve use-after-free error. Signed-off-by: Charlie Jenkins Fixes: d8792a5734b0 ("riscv: Safely remove entries from relocation list") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202312132019.iYGTwW0L-lkp@intel.com/ Reported-by: kernel test robot Reported-by: Julia Lawall Closes: https://lore.kernel.org/r/202312120044.wTI1Uyaa-lkp@intel.com/ Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20240104-module_loading_fix-v3-1-a71f8de6ce0f@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index aac019ed63b1b..21c7a773a8efa 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -723,8 +723,8 @@ static int add_relocation_to_accumulate(struct module *me, int type, if (!bucket) { kfree(entry); - kfree(rel_head); kfree(rel_head->rel_entry); + kfree(rel_head); return -ENOMEM; } -- GitLab From 4b38b36bfbd83b23e20c172d08dd85773791e3bd Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Thu, 4 Jan 2024 11:42:48 -0800 Subject: [PATCH 0762/1290] riscv: Correctly free relocation hashtable on error When there is not enough allocatable memory for the relocation hashtable, module loading should exit gracefully. Previously, this was attempted to be accomplished by checking if an unsigned number is less than zero which does not work. Instead have the caller check if the hashtable was correctly allocated and add a comment explaining that hashtable_bits that is 0 is valid. Signed-off-by: Charlie Jenkins Fixes: d8792a5734b0 ("riscv: Safely remove entries from relocation list") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202312132019.iYGTwW0L-lkp@intel.com/ Reported-by: kernel test robot Reported-by: Julia Lawall Closes: https://lore.kernel.org/r/202312120044.wTI1Uyaa-lkp@intel.com/ Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20240104-module_loading_fix-v3-2-a71f8de6ce0f@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/module.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 21c7a773a8efa..32743180e8ef5 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -747,6 +747,10 @@ initialize_relocation_hashtable(unsigned int num_relocations, { /* Can safely assume that bits is not greater than sizeof(long) */ unsigned long hashtable_size = roundup_pow_of_two(num_relocations); + /* + * When hashtable_size == 1, hashtable_bits == 0. + * This is valid because the hashing algorithm returns 0 in this case. + */ unsigned int hashtable_bits = ilog2(hashtable_size); /* @@ -763,7 +767,7 @@ initialize_relocation_hashtable(unsigned int num_relocations, sizeof(*relocation_hashtable), GFP_KERNEL); if (!*relocation_hashtable) - return -ENOMEM; + return 0; __hash_init(*relocation_hashtable, hashtable_size); @@ -789,8 +793,8 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, hashtable_bits = initialize_relocation_hashtable(num_relocations, &relocation_hashtable); - if (hashtable_bits < 0) - return hashtable_bits; + if (!relocation_hashtable) + return -ENOMEM; INIT_LIST_HEAD(&used_buckets_list); -- GitLab From a35551c7244d9d061643a01eb96cc3ba04eaf45c Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Thu, 4 Jan 2024 11:42:49 -0800 Subject: [PATCH 0763/1290] riscv: Fix relocation_hashtable size A second dereference is needed to get the accurate size of the relocation_hashtable. Signed-off-by: Charlie Jenkins Fixes: d8792a5734b0 ("riscv: Safely remove entries from relocation list") Reported-by: kernel test robot Reported-by: Julia Lawall Closes: https://lore.kernel.org/r/202312120044.wTI1Uyaa-lkp@intel.com/ Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20240104-module_loading_fix-v3-3-a71f8de6ce0f@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 32743180e8ef5..ceb0adb387158 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -764,7 +764,7 @@ initialize_relocation_hashtable(unsigned int num_relocations, hashtable_size <<= should_double_size; *relocation_hashtable = kmalloc_array(hashtable_size, - sizeof(*relocation_hashtable), + sizeof(**relocation_hashtable), GFP_KERNEL); if (!*relocation_hashtable) return 0; -- GitLab From f503b167b66007fc6b4434cd07a044ce4a56b6a0 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Fri, 24 Nov 2023 12:39:01 +0530 Subject: [PATCH 0764/1290] RISC-V: Add stubs for sbi_console_putchar/getchar() The functions sbi_console_putchar() and sbi_console_getchar() are not defined when CONFIG_RISCV_SBI_V01 is disabled so let us add stub of these functions to avoid "#ifdef" on user side. Signed-off-by: Anup Patel Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20231124070905.1043092-2-apatel@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 0892f4421bc4a..66f3933c14f63 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -271,8 +271,13 @@ struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, unsigned long arg3, unsigned long arg4, unsigned long arg5); +#ifdef CONFIG_RISCV_SBI_V01 void sbi_console_putchar(int ch); int sbi_console_getchar(void); +#else +static inline void sbi_console_putchar(int ch) { } +static inline int sbi_console_getchar(void) { return -ENOENT; } +#endif long sbi_get_mvendorid(void); long sbi_get_marchid(void); long sbi_get_mimpid(void); -- GitLab From f43fabf444ca3c4c74bf5fa5211bb2d0548715c4 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Fri, 24 Nov 2023 12:39:02 +0530 Subject: [PATCH 0765/1290] RISC-V: Add SBI debug console helper routines Let us provide SBI debug console helper routines which can be shared by serial/earlycon-riscv-sbi.c and hvc/hvc_riscv_sbi.c. Signed-off-by: Anup Patel Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20231124070905.1043092-3-apatel@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 5 +++ arch/riscv/kernel/sbi.c | 66 ++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 66f3933c14f63..9eef25308d537 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -334,6 +334,11 @@ static inline unsigned long sbi_mk_version(unsigned long major, } int sbi_err_map_linux_errno(int err); + +extern bool sbi_debug_console_available; +int sbi_debug_console_write(const char *bytes, unsigned int num_bytes); +int sbi_debug_console_read(char *bytes, unsigned int num_bytes); + #else /* CONFIG_RISCV_SBI */ static inline int sbi_remote_fence_i(const struct cpumask *cpu_mask) { return -1; } static inline void sbi_init(void) {} diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 5a62ed1da4533..e66e0999a8005 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -571,6 +572,66 @@ long sbi_get_mimpid(void) } EXPORT_SYMBOL_GPL(sbi_get_mimpid); +bool sbi_debug_console_available; + +int sbi_debug_console_write(const char *bytes, unsigned int num_bytes) +{ + phys_addr_t base_addr; + struct sbiret ret; + + if (!sbi_debug_console_available) + return -EOPNOTSUPP; + + if (is_vmalloc_addr(bytes)) + base_addr = page_to_phys(vmalloc_to_page(bytes)) + + offset_in_page(bytes); + else + base_addr = __pa(bytes); + if (PAGE_SIZE < (offset_in_page(bytes) + num_bytes)) + num_bytes = PAGE_SIZE - offset_in_page(bytes); + + if (IS_ENABLED(CONFIG_32BIT)) + ret = sbi_ecall(SBI_EXT_DBCN, SBI_EXT_DBCN_CONSOLE_WRITE, + num_bytes, lower_32_bits(base_addr), + upper_32_bits(base_addr), 0, 0, 0); + else + ret = sbi_ecall(SBI_EXT_DBCN, SBI_EXT_DBCN_CONSOLE_WRITE, + num_bytes, base_addr, 0, 0, 0, 0); + + if (ret.error == SBI_ERR_FAILURE) + return -EIO; + return ret.error ? sbi_err_map_linux_errno(ret.error) : ret.value; +} + +int sbi_debug_console_read(char *bytes, unsigned int num_bytes) +{ + phys_addr_t base_addr; + struct sbiret ret; + + if (!sbi_debug_console_available) + return -EOPNOTSUPP; + + if (is_vmalloc_addr(bytes)) + base_addr = page_to_phys(vmalloc_to_page(bytes)) + + offset_in_page(bytes); + else + base_addr = __pa(bytes); + if (PAGE_SIZE < (offset_in_page(bytes) + num_bytes)) + num_bytes = PAGE_SIZE - offset_in_page(bytes); + + if (IS_ENABLED(CONFIG_32BIT)) + ret = sbi_ecall(SBI_EXT_DBCN, SBI_EXT_DBCN_CONSOLE_READ, + num_bytes, lower_32_bits(base_addr), + upper_32_bits(base_addr), 0, 0, 0); + else + ret = sbi_ecall(SBI_EXT_DBCN, SBI_EXT_DBCN_CONSOLE_READ, + num_bytes, base_addr, 0, 0, 0, 0); + + if (ret.error == SBI_ERR_FAILURE) + return -EIO; + return ret.error ? sbi_err_map_linux_errno(ret.error) : ret.value; +} + void __init sbi_init(void) { int ret; @@ -612,6 +673,11 @@ void __init sbi_init(void) sbi_srst_reboot_nb.priority = 192; register_restart_handler(&sbi_srst_reboot_nb); } + if ((sbi_spec_version >= sbi_mk_version(2, 0)) && + (sbi_probe_extension(SBI_EXT_DBCN) > 0)) { + pr_info("SBI DBCN extension detected\n"); + sbi_debug_console_available = true; + } } else { __sbi_set_timer = __sbi_set_timer_v01; __sbi_send_ipi = __sbi_send_ipi_v01; -- GitLab From c77bf3607a0f0180aa674b58cfa76633215bb42f Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Fri, 24 Nov 2023 12:39:03 +0530 Subject: [PATCH 0766/1290] tty/serial: Add RISC-V SBI debug console based earlycon We extend the existing RISC-V SBI earlycon support to use the new RISC-V SBI debug console extension. Signed-off-by: Anup Patel Reviewed-by: Andrew Jones Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20231124070905.1043092-4-apatel@ventanamicro.com Signed-off-by: Palmer Dabbelt --- drivers/tty/serial/Kconfig | 2 +- drivers/tty/serial/earlycon-riscv-sbi.c | 27 ++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 732c893c8d161..1f2594b8ab9d8 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -87,7 +87,7 @@ config SERIAL_EARLYCON_SEMIHOST config SERIAL_EARLYCON_RISCV_SBI bool "Early console using RISC-V SBI" - depends on RISCV_SBI_V01 + depends on RISCV_SBI select SERIAL_CORE select SERIAL_CORE_CONSOLE select SERIAL_EARLYCON diff --git a/drivers/tty/serial/earlycon-riscv-sbi.c b/drivers/tty/serial/earlycon-riscv-sbi.c index 27afb0b74ea70..0162155f0c839 100644 --- a/drivers/tty/serial/earlycon-riscv-sbi.c +++ b/drivers/tty/serial/earlycon-riscv-sbi.c @@ -15,17 +15,38 @@ static void sbi_putc(struct uart_port *port, unsigned char c) sbi_console_putchar(c); } -static void sbi_console_write(struct console *con, - const char *s, unsigned n) +static void sbi_0_1_console_write(struct console *con, + const char *s, unsigned int n) { struct earlycon_device *dev = con->data; uart_console_write(&dev->port, s, n, sbi_putc); } +static void sbi_dbcn_console_write(struct console *con, + const char *s, unsigned int n) +{ + int ret; + + while (n) { + ret = sbi_debug_console_write(s, n); + if (ret < 0) + break; + + s += ret; + n -= ret; + } +} + static int __init early_sbi_setup(struct earlycon_device *device, const char *opt) { - device->con->write = sbi_console_write; + if (sbi_debug_console_available) + device->con->write = sbi_dbcn_console_write; + else if (IS_ENABLED(CONFIG_RISCV_SBI_V01)) + device->con->write = sbi_0_1_console_write; + else + return -ENODEV; + return 0; } EARLYCON_DECLARE(sbi, early_sbi_setup); -- GitLab From 88ead68e764cd164abb965e258c4e18841433ecf Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Fri, 24 Nov 2023 12:39:04 +0530 Subject: [PATCH 0767/1290] tty: Add SBI debug console support to HVC SBI driver RISC-V SBI specification supports advanced debug console support via SBI DBCN extension. Extend the HVC SBI driver to support it. Signed-off-by: Atish Patra Signed-off-by: Anup Patel Reviewed-by: Andrew Jones Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20231124070905.1043092-5-apatel@ventanamicro.com Signed-off-by: Palmer Dabbelt --- drivers/tty/hvc/Kconfig | 2 +- drivers/tty/hvc/hvc_riscv_sbi.c | 37 ++++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig index 4f9264d005c06..6e05c5c7bca1a 100644 --- a/drivers/tty/hvc/Kconfig +++ b/drivers/tty/hvc/Kconfig @@ -108,7 +108,7 @@ config HVC_DCC_SERIALIZE_SMP config HVC_RISCV_SBI bool "RISC-V SBI console support" - depends on RISCV_SBI_V01 + depends on RISCV_SBI select HVC_DRIVER help This enables support for console output via RISC-V SBI calls, which diff --git a/drivers/tty/hvc/hvc_riscv_sbi.c b/drivers/tty/hvc/hvc_riscv_sbi.c index 31f53fa77e4af..2f3571f17ecd4 100644 --- a/drivers/tty/hvc/hvc_riscv_sbi.c +++ b/drivers/tty/hvc/hvc_riscv_sbi.c @@ -39,21 +39,44 @@ static int hvc_sbi_tty_get(uint32_t vtermno, char *buf, int count) return i; } -static const struct hv_ops hvc_sbi_ops = { +static const struct hv_ops hvc_sbi_v01_ops = { .get_chars = hvc_sbi_tty_get, .put_chars = hvc_sbi_tty_put, }; -static int __init hvc_sbi_init(void) +static int hvc_sbi_dbcn_tty_put(uint32_t vtermno, const char *buf, int count) { - return PTR_ERR_OR_ZERO(hvc_alloc(0, 0, &hvc_sbi_ops, 16)); + return sbi_debug_console_write(buf, count); } -device_initcall(hvc_sbi_init); -static int __init hvc_sbi_console_init(void) +static int hvc_sbi_dbcn_tty_get(uint32_t vtermno, char *buf, int count) { - hvc_instantiate(0, 0, &hvc_sbi_ops); + return sbi_debug_console_read(buf, count); +} + +static const struct hv_ops hvc_sbi_dbcn_ops = { + .put_chars = hvc_sbi_dbcn_tty_put, + .get_chars = hvc_sbi_dbcn_tty_get, +}; + +static int __init hvc_sbi_init(void) +{ + int err; + + if (sbi_debug_console_available) { + err = PTR_ERR_OR_ZERO(hvc_alloc(0, 0, &hvc_sbi_dbcn_ops, 256)); + if (err) + return err; + hvc_instantiate(0, 0, &hvc_sbi_dbcn_ops); + } else if (IS_ENABLED(CONFIG_RISCV_SBI_V01)) { + err = PTR_ERR_OR_ZERO(hvc_alloc(0, 0, &hvc_sbi_v01_ops, 256)); + if (err) + return err; + hvc_instantiate(0, 0, &hvc_sbi_v01_ops); + } else { + return -ENODEV; + } return 0; } -console_initcall(hvc_sbi_console_init); +device_initcall(hvc_sbi_init); -- GitLab From 50942ad6ddb57d3cfe2e4fc1f08714d54b2565ef Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Fri, 24 Nov 2023 12:39:05 +0530 Subject: [PATCH 0768/1290] RISC-V: Enable SBI based earlycon support Let us enable SBI based earlycon support in defconfig for both RV32 and RV64 so that "earlycon=sbi" can be used again. Signed-off-by: Anup Patel Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20231124070905.1043092-6-apatel@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 905881282a7cd..eaf34e871e308 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -149,6 +149,7 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_DW=y CONFIG_SERIAL_OF_PLATFORM=y CONFIG_SERIAL_SH_SCI=y +CONFIG_SERIAL_EARLYCON_RISCV_SBI=y CONFIG_VIRTIO_CONSOLE=y CONFIG_HW_RANDOM=y CONFIG_HW_RANDOM_VIRTIO=y -- GitLab From 742e324a0679ce271f7475a40056bac6917a950c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2024 08:29:53 -0700 Subject: [PATCH 0769/1290] block/iocost: silence warning on 'last_period' potentially being unused If CONFIG_TRACEPOINTS isn't enabled, we assign this variable but then never use it. This can cause the compiler to complain about that: block/blk-iocost.c:1264:6: warning: variable 'last_period' set but not used [-Wunused-but-set-variable] 1264 | u64 last_period, cur_period; | ^ Rather than add ifdefs to guard this, just mark it __maybe_unused. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401102335.GiWdeIo9-lkp@intel.com/ Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 089fcb9cfce37..c8beec6d7df08 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1261,7 +1261,7 @@ static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - u64 last_period, cur_period; + u64 __maybe_unused last_period, cur_period; u64 vtime, vtarget; int i; -- GitLab From 748dc0b65ec2b4b7b3dbd7befcc4a54fdcac7988 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Jan 2024 18:29:42 +0900 Subject: [PATCH 0770/1290] block: fix partial zone append completion handling in req_bio_endio() Partial completions of zone append request is not allowed but if a zone append completion indicates a number of completed bytes different from the original BIO size, only the BIO status is set to error. This leads to bio_advance() not setting the BIO size to 0 and thus to not call bio_endio() at the end of req_bio_endio(). Make sure a partially completed zone append is failed and completed immediately by forcing the completed number of bytes (nbytes) to be equal to the BIO size, thus ensuring that bio_endio() is called. Fixes: 297db731847e ("block: fix req_bio_endio append error handling") Cc: stable@kernel.vger.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240110092942.442334-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index fb29ff5cc281d..aa9a05fdd0237 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -772,11 +772,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio, /* * Partial zone append completions cannot be supported as the * BIO fragments may end up not being written sequentially. + * For such case, force the completed nbytes to be equal to + * the BIO size so that bio_advance() sets the BIO remaining + * size to 0 and we end up calling bio_endio() before returning. */ - if (bio->bi_iter.bi_size != nbytes) + if (bio->bi_iter.bi_size != nbytes) { bio->bi_status = BLK_STS_IOERR; - else + nbytes = bio->bi_iter.bi_size; + } else { bio->bi_iter.bi_sector = rq->__sector; + } } bio_advance(bio, nbytes); -- GitLab From a4ff64edf9edc8f05e2183610dc8306d3279c6ac Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 14 Nov 2023 22:33:37 +0800 Subject: [PATCH 0771/1290] riscv: errata: thead: use riscv_nonstd_cache_ops for CMO Previously, we use alternative mechanism to dynamically patch the CMO operations for THEAD C906/C910 during boot for performance reason. But as pointed out by Arnd, "there is already a significant cost in accessing the invalidated cache lines afterwards, which is likely going to be much higher than the cost of an indirect branch". And indeed, there's no performance difference with GMAC and EMMC per my test on Sipeed Lichee Pi 4A board. Use riscv_nonstd_cache_ops for THEAD C906/C910 CMO to simplify the alternative code, and to acchieve Arnd's goal -- "I think moving the THEAD ops at the same level as all nonstandard operations makes sense, but I'd still leave CMO as an explicit fast path that avoids the indirect branch. This seems like the right thing to do both for readability and for platforms on which the indirect branch has a noticeable overhead." Signed-off-by: Jisheng Zhang Tested-by: Emil Renner Berthing Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20231114143338.2406-2-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.errata | 1 + arch/riscv/errata/thead/errata.c | 75 +++++++++++++++++++++++++++- arch/riscv/include/asm/errata_list.h | 50 +++---------------- 3 files changed, 80 insertions(+), 46 deletions(-) diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata index e2c731cfed8cc..dedb8b238e73d 100644 --- a/arch/riscv/Kconfig.errata +++ b/arch/riscv/Kconfig.errata @@ -79,6 +79,7 @@ config ERRATA_THEAD_CMO depends on ERRATA_THEAD && MMU select DMA_DIRECT_REMAP select RISCV_DMA_NONCOHERENT + select RISCV_NONSTANDARD_CACHE_OPS default y help This will apply the cache management errata to handle the diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index 0554ed4bf087c..c07d957b14680 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -12,8 +12,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -33,6 +35,75 @@ static bool errata_probe_pbmt(unsigned int stage, return false; } +/* + * th.dcache.ipa rs1 (invalidate, physical address) + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 | + * 0000001 01010 rs1 000 00000 0001011 + * th.dcache.iva rs1 (invalidate, virtual address) + * 0000001 00110 rs1 000 00000 0001011 + * + * th.dcache.cpa rs1 (clean, physical address) + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 | + * 0000001 01001 rs1 000 00000 0001011 + * th.dcache.cva rs1 (clean, virtual address) + * 0000001 00101 rs1 000 00000 0001011 + * + * th.dcache.cipa rs1 (clean then invalidate, physical address) + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 | + * 0000001 01011 rs1 000 00000 0001011 + * th.dcache.civa rs1 (clean then invalidate, virtual address) + * 0000001 00111 rs1 000 00000 0001011 + * + * th.sync.s (make sure all cache operations finished) + * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 | + * 0000000 11001 00000 000 00000 0001011 + */ +#define THEAD_INVAL_A0 ".long 0x0265000b" +#define THEAD_CLEAN_A0 ".long 0x0255000b" +#define THEAD_FLUSH_A0 ".long 0x0275000b" +#define THEAD_SYNC_S ".long 0x0190000b" + +#define THEAD_CMO_OP(_op, _start, _size, _cachesize) \ +asm volatile("mv a0, %1\n\t" \ + "j 2f\n\t" \ + "3:\n\t" \ + THEAD_##_op##_A0 "\n\t" \ + "add a0, a0, %0\n\t" \ + "2:\n\t" \ + "bltu a0, %2, 3b\n\t" \ + THEAD_SYNC_S \ + : : "r"(_cachesize), \ + "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)), \ + "r"((unsigned long)(_start) + (_size)) \ + : "a0") + +static void thead_errata_cache_inv(phys_addr_t paddr, size_t size) +{ + void *vaddr = phys_to_virt(paddr); + + THEAD_CMO_OP(INVAL, vaddr, size, riscv_cbom_block_size); +} + +static void thead_errata_cache_wback(phys_addr_t paddr, size_t size) +{ + void *vaddr = phys_to_virt(paddr); + + THEAD_CMO_OP(CLEAN, vaddr, size, riscv_cbom_block_size); +} + +static void thead_errata_cache_wback_inv(phys_addr_t paddr, size_t size) +{ + void *vaddr = phys_to_virt(paddr); + + THEAD_CMO_OP(FLUSH, vaddr, size, riscv_cbom_block_size); +} + +static const struct riscv_nonstd_cache_ops thead_errata_cmo_ops = { + .wback = &thead_errata_cache_wback, + .inv = &thead_errata_cache_inv, + .wback_inv = &thead_errata_cache_wback_inv, +}; + static bool errata_probe_cmo(unsigned int stage, unsigned long arch_id, unsigned long impid) { @@ -48,6 +119,7 @@ static bool errata_probe_cmo(unsigned int stage, if (stage == RISCV_ALTERNATIVES_BOOT) { riscv_cbom_block_size = L1_CACHE_BYTES; riscv_noncoherent_supported(); + riscv_noncoherent_register_cache_ops(&thead_errata_cmo_ops); } return true; @@ -77,8 +149,7 @@ static u32 thead_errata_probe(unsigned int stage, if (errata_probe_pbmt(stage, archid, impid)) cpu_req_errata |= BIT(ERRATA_THEAD_PBMT); - if (errata_probe_cmo(stage, archid, impid)) - cpu_req_errata |= BIT(ERRATA_THEAD_CMO); + errata_probe_cmo(stage, archid, impid); if (errata_probe_pmu(stage, archid, impid)) cpu_req_errata |= BIT(ERRATA_THEAD_PMU); diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index 83ed25e435534..ea33288f8a25b 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -24,9 +24,8 @@ #ifdef CONFIG_ERRATA_THEAD #define ERRATA_THEAD_PBMT 0 -#define ERRATA_THEAD_CMO 1 -#define ERRATA_THEAD_PMU 2 -#define ERRATA_THEAD_NUMBER 3 +#define ERRATA_THEAD_PMU 1 +#define ERRATA_THEAD_NUMBER 2 #endif #ifdef __ASSEMBLY__ @@ -94,54 +93,17 @@ asm volatile(ALTERNATIVE( \ #define ALT_THEAD_PMA(_val) #endif -/* - * th.dcache.ipa rs1 (invalidate, physical address) - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 | - * 0000001 01010 rs1 000 00000 0001011 - * th.dache.iva rs1 (invalida, virtual address) - * 0000001 00110 rs1 000 00000 0001011 - * - * th.dcache.cpa rs1 (clean, physical address) - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 | - * 0000001 01001 rs1 000 00000 0001011 - * th.dcache.cva rs1 (clean, virtual address) - * 0000001 00101 rs1 000 00000 0001011 - * - * th.dcache.cipa rs1 (clean then invalidate, physical address) - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 | - * 0000001 01011 rs1 000 00000 0001011 - * th.dcache.civa rs1 (... virtual address) - * 0000001 00111 rs1 000 00000 0001011 - * - * th.sync.s (make sure all cache operations finished) - * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 | - * 0000000 11001 00000 000 00000 0001011 - */ -#define THEAD_INVAL_A0 ".long 0x0265000b" -#define THEAD_CLEAN_A0 ".long 0x0255000b" -#define THEAD_FLUSH_A0 ".long 0x0275000b" -#define THEAD_SYNC_S ".long 0x0190000b" - #define ALT_CMO_OP(_op, _start, _size, _cachesize) \ -asm volatile(ALTERNATIVE_2( \ - __nops(6), \ +asm volatile(ALTERNATIVE( \ + __nops(5), \ "mv a0, %1\n\t" \ "j 2f\n\t" \ "3:\n\t" \ CBO_##_op(a0) \ "add a0, a0, %0\n\t" \ "2:\n\t" \ - "bltu a0, %2, 3b\n\t" \ - "nop", 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM, \ - "mv a0, %1\n\t" \ - "j 2f\n\t" \ - "3:\n\t" \ - THEAD_##_op##_A0 "\n\t" \ - "add a0, a0, %0\n\t" \ - "2:\n\t" \ - "bltu a0, %2, 3b\n\t" \ - THEAD_SYNC_S, THEAD_VENDOR_ID, \ - ERRATA_THEAD_CMO, CONFIG_ERRATA_THEAD_CMO) \ + "bltu a0, %2, 3b\n\t", \ + 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM) \ : : "r"(_cachesize), \ "r"((unsigned long)(_start) & ~((_cachesize) - 1UL)), \ "r"((unsigned long)(_start) + (_size)) \ -- GitLab From 3690492612ecd2b3fd69f39f3a56f6313ea8ef8b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 14 Nov 2023 22:33:38 +0800 Subject: [PATCH 0772/1290] riscv: errata: thead: use pa based instructions for CMO T-HEAD CPUs such as C906/C910/C920 support phy address based CMO, use them so that we don't need to convert to virt address. Signed-off-by: Jisheng Zhang Reviewed-by: Guo Ren Link: https://lore.kernel.org/r/20231114143338.2406-3-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/errata/thead/errata.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index c07d957b14680..b1c410bbc1aec 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -58,9 +58,9 @@ static bool errata_probe_pbmt(unsigned int stage, * | 31 - 25 | 24 - 20 | 19 - 15 | 14 - 12 | 11 - 7 | 6 - 0 | * 0000000 11001 00000 000 00000 0001011 */ -#define THEAD_INVAL_A0 ".long 0x0265000b" -#define THEAD_CLEAN_A0 ".long 0x0255000b" -#define THEAD_FLUSH_A0 ".long 0x0275000b" +#define THEAD_INVAL_A0 ".long 0x02a5000b" +#define THEAD_CLEAN_A0 ".long 0x0295000b" +#define THEAD_FLUSH_A0 ".long 0x02b5000b" #define THEAD_SYNC_S ".long 0x0190000b" #define THEAD_CMO_OP(_op, _start, _size, _cachesize) \ @@ -79,23 +79,17 @@ asm volatile("mv a0, %1\n\t" \ static void thead_errata_cache_inv(phys_addr_t paddr, size_t size) { - void *vaddr = phys_to_virt(paddr); - - THEAD_CMO_OP(INVAL, vaddr, size, riscv_cbom_block_size); + THEAD_CMO_OP(INVAL, paddr, size, riscv_cbom_block_size); } static void thead_errata_cache_wback(phys_addr_t paddr, size_t size) { - void *vaddr = phys_to_virt(paddr); - - THEAD_CMO_OP(CLEAN, vaddr, size, riscv_cbom_block_size); + THEAD_CMO_OP(CLEAN, paddr, size, riscv_cbom_block_size); } static void thead_errata_cache_wback_inv(phys_addr_t paddr, size_t size) { - void *vaddr = phys_to_virt(paddr); - - THEAD_CMO_OP(FLUSH, vaddr, size, riscv_cbom_block_size); + THEAD_CMO_OP(FLUSH, paddr, size, riscv_cbom_block_size); } static const struct riscv_nonstd_cache_ops thead_errata_cmo_ops = { -- GitLab From b12fbc3f787e3d7e47af274a761fdee6e867bc7d Mon Sep 17 00:00:00 2001 From: David Stevens Date: Wed, 10 Jan 2024 11:19:25 +0900 Subject: [PATCH 0773/1290] virtio_balloon: stay awake while adjusting balloon A virtio_balloon's parent device may be configured so that a configuration change interrupt is a wakeup event. Extend the processing of such a wakeup event until the balloon finishes inflating or deflating by calling pm_stay_awake/pm_relax in the virtio_balloon driver. Note that these calls are no-ops if the parent device doesn't support wakeup events or if the wakeup events are not enabled. This change allows the guest to use system power states such as s2idle without running the risk the virtio_balloon's cooperative memory management becoming unresponsive to the host's requests. Tested-by: Theodore Ts'o Signed-off-by: David Stevens Signed-off-by: Theodore Ts'o Message-Id: <20240110021925.1137333-1-stevensd@google.com> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_balloon.c | 57 +++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 1fe93e93f5bcc..fa710e6c505ad 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -119,6 +119,11 @@ struct virtio_balloon { /* Free page reporting device */ struct virtqueue *reporting_vq; struct page_reporting_dev_info pr_dev_info; + + /* State for keeping the wakeup_source active while adjusting the balloon */ + spinlock_t adjustment_lock; + bool adjustment_signal_pending; + bool adjustment_in_progress; }; static const struct virtio_device_id id_table[] = { @@ -437,6 +442,31 @@ static void virtio_balloon_queue_free_page_work(struct virtio_balloon *vb) queue_work(vb->balloon_wq, &vb->report_free_page_work); } +static void start_update_balloon_size(struct virtio_balloon *vb) +{ + unsigned long flags; + + spin_lock_irqsave(&vb->adjustment_lock, flags); + vb->adjustment_signal_pending = true; + if (!vb->adjustment_in_progress) { + vb->adjustment_in_progress = true; + pm_stay_awake(vb->vdev->dev.parent); + } + spin_unlock_irqrestore(&vb->adjustment_lock, flags); + + queue_work(system_freezable_wq, &vb->update_balloon_size_work); +} + +static void end_update_balloon_size(struct virtio_balloon *vb) +{ + spin_lock_irq(&vb->adjustment_lock); + if (!vb->adjustment_signal_pending && vb->adjustment_in_progress) { + vb->adjustment_in_progress = false; + pm_relax(vb->vdev->dev.parent); + } + spin_unlock_irq(&vb->adjustment_lock); +} + static void virtballoon_changed(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; @@ -444,8 +474,7 @@ static void virtballoon_changed(struct virtio_device *vdev) spin_lock_irqsave(&vb->stop_update_lock, flags); if (!vb->stop_update) { - queue_work(system_freezable_wq, - &vb->update_balloon_size_work); + start_update_balloon_size(vb); virtio_balloon_queue_free_page_work(vb); } spin_unlock_irqrestore(&vb->stop_update_lock, flags); @@ -476,19 +505,25 @@ static void update_balloon_size_func(struct work_struct *work) vb = container_of(work, struct virtio_balloon, update_balloon_size_work); - diff = towards_target(vb); - if (!diff) - return; + spin_lock_irq(&vb->adjustment_lock); + vb->adjustment_signal_pending = false; + spin_unlock_irq(&vb->adjustment_lock); - if (diff > 0) - diff -= fill_balloon(vb, diff); - else - diff += leak_balloon(vb, -diff); - update_balloon_size(vb); + diff = towards_target(vb); + + if (diff) { + if (diff > 0) + diff -= fill_balloon(vb, diff); + else + diff += leak_balloon(vb, -diff); + update_balloon_size(vb); + } if (diff) queue_work(system_freezable_wq, work); + else + end_update_balloon_size(vb); } static int init_vqs(struct virtio_balloon *vb) @@ -992,6 +1027,8 @@ static int virtballoon_probe(struct virtio_device *vdev) goto out_unregister_oom; } + spin_lock_init(&vb->adjustment_lock); + virtio_device_ready(vdev); if (towards_target(vb)) -- GitLab From 35967bdcff325f4572b21b0d0005318da7e03f53 Mon Sep 17 00:00:00 2001 From: Changyuan Lyu Date: Wed, 20 Dec 2023 12:49:06 -0800 Subject: [PATCH 0774/1290] virtio_pmem: support feature SHMEM_REGION This patch adds the support for feature VIRTIO_PMEM_F_SHMEM_REGION (virtio spec v1.2 section 5.19.5.2 [1]). During feature negotiation, if VIRTIO_PMEM_F_SHMEM_REGION is offered by the device, the driver looks for a shared memory region of id 0. If it is found, this feature is understood. Otherwise, this feature bit is cleared. During probe, if VIRTIO_PMEM_F_SHMEM_REGION has been negotiated, virtio pmem ignores the `start` and `size` fields in device config and uses the physical address range of shared memory region 0. [1] https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-6480002 Signed-off-by: Changyuan Lyu Message-Id: <20231220204906.566922-1-changyuanl@google.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/nvdimm/virtio_pmem.c | 36 ++++++++++++++++++++++++++++---- include/uapi/linux/virtio_pmem.h | 7 +++++++ 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c index a92eb172f0e7e..4ceced5cefcf1 100644 --- a/drivers/nvdimm/virtio_pmem.c +++ b/drivers/nvdimm/virtio_pmem.c @@ -29,12 +29,27 @@ static int init_vq(struct virtio_pmem *vpmem) return 0; }; +static int virtio_pmem_validate(struct virtio_device *vdev) +{ + struct virtio_shm_region shm_reg; + + if (virtio_has_feature(vdev, VIRTIO_PMEM_F_SHMEM_REGION) && + !virtio_get_shm_region(vdev, &shm_reg, (u8)VIRTIO_PMEM_SHMEM_REGION_ID) + ) { + dev_notice(&vdev->dev, "failed to get shared memory region %d\n", + VIRTIO_PMEM_SHMEM_REGION_ID); + __virtio_clear_bit(vdev, VIRTIO_PMEM_F_SHMEM_REGION); + } + return 0; +} + static int virtio_pmem_probe(struct virtio_device *vdev) { struct nd_region_desc ndr_desc = {}; struct nd_region *nd_region; struct virtio_pmem *vpmem; struct resource res; + struct virtio_shm_region shm_reg; int err = 0; if (!vdev->config->get) { @@ -57,10 +72,16 @@ static int virtio_pmem_probe(struct virtio_device *vdev) goto out_err; } - virtio_cread_le(vpmem->vdev, struct virtio_pmem_config, - start, &vpmem->start); - virtio_cread_le(vpmem->vdev, struct virtio_pmem_config, - size, &vpmem->size); + if (virtio_has_feature(vdev, VIRTIO_PMEM_F_SHMEM_REGION)) { + virtio_get_shm_region(vdev, &shm_reg, (u8)VIRTIO_PMEM_SHMEM_REGION_ID); + vpmem->start = shm_reg.addr; + vpmem->size = shm_reg.len; + } else { + virtio_cread_le(vpmem->vdev, struct virtio_pmem_config, + start, &vpmem->start); + virtio_cread_le(vpmem->vdev, struct virtio_pmem_config, + size, &vpmem->size); + } res.start = vpmem->start; res.end = vpmem->start + vpmem->size - 1; @@ -122,10 +143,17 @@ static void virtio_pmem_remove(struct virtio_device *vdev) virtio_reset_device(vdev); } +static unsigned int features[] = { + VIRTIO_PMEM_F_SHMEM_REGION, +}; + static struct virtio_driver virtio_pmem_driver = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, + .validate = virtio_pmem_validate, .probe = virtio_pmem_probe, .remove = virtio_pmem_remove, }; diff --git a/include/uapi/linux/virtio_pmem.h b/include/uapi/linux/virtio_pmem.h index d676b3620383c..ede4f3564977d 100644 --- a/include/uapi/linux/virtio_pmem.h +++ b/include/uapi/linux/virtio_pmem.h @@ -14,6 +14,13 @@ #include #include +/* Feature bits */ +/* guest physical address range will be indicated as shared memory region 0 */ +#define VIRTIO_PMEM_F_SHMEM_REGION 0 + +/* shmid of the shared memory region corresponding to the pmem */ +#define VIRTIO_PMEM_SHMEM_REGION_ID 0 + struct virtio_pmem_config { __le64 start; __le64 size; -- GitLab From 95e7249691f082a5178d4d6f60fcdee91da458ab Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 13 Dec 2023 23:26:49 -0600 Subject: [PATCH 0775/1290] scsi: virtio_scsi: Add mq_poll support This adds polling support to virtio-scsi. It's based on and works similar to virtblk support where we add a module param to specify the number of poll queues then subtract to calculate the IO queues. When using 8 poll queues and a vhost worker per queue we see 4K IOPs with fio: fio --filename=/dev/sda --direct=1 --rw=randread --bs=4k \ --ioengine=io_uring --hipri --iodepth=128 --numjobs=$NUM_JOBS increase like: jobs base poll 1 207K 296K 2 392K 552K 3 581K 860K 4 765K 1235K 5 936K 1598K 6 1104K 1880K 7 1253K 2095K 8 1311k 2187K Signed-off-by: Mike Christie Message-Id: <20231214052649.57743-1-michael.christie@oracle.com> Signed-off-by: Michael S. Tsirkin --- drivers/scsi/virtio_scsi.c | 78 +++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 9d1bdcdc13312..4cf20be668a60 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -37,6 +37,11 @@ #define VIRTIO_SCSI_EVENT_LEN 8 #define VIRTIO_SCSI_VQ_BASE 2 +static unsigned int virtscsi_poll_queues; +module_param(virtscsi_poll_queues, uint, 0644); +MODULE_PARM_DESC(virtscsi_poll_queues, + "The number of dedicated virtqueues for polling I/O"); + /* Command queue element */ struct virtio_scsi_cmd { struct scsi_cmnd *sc; @@ -76,6 +81,7 @@ struct virtio_scsi { struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; u32 num_queues; + int io_queues[HCTX_MAX_TYPES]; struct hlist_node node; @@ -722,9 +728,49 @@ static int virtscsi_abort(struct scsi_cmnd *sc) static void virtscsi_map_queues(struct Scsi_Host *shost) { struct virtio_scsi *vscsi = shost_priv(shost); - struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; + int i, qoff; + + for (i = 0, qoff = 0; i < shost->nr_maps; i++) { + struct blk_mq_queue_map *map = &shost->tag_set.map[i]; + + map->nr_queues = vscsi->io_queues[i]; + map->queue_offset = qoff; + qoff += map->nr_queues; + + if (map->nr_queues == 0) + continue; + + /* + * Regular queues have interrupts and hence CPU affinity is + * defined by the core virtio code, but polling queues have + * no interrupts so we let the block layer assign CPU affinity. + */ + if (i == HCTX_TYPE_POLL) + blk_mq_map_queues(map); + else + blk_mq_virtio_map_queues(map, vscsi->vdev, 2); + } +} + +static int virtscsi_mq_poll(struct Scsi_Host *shost, unsigned int queue_num) +{ + struct virtio_scsi *vscsi = shost_priv(shost); + struct virtio_scsi_vq *virtscsi_vq = &vscsi->req_vqs[queue_num]; + unsigned long flags; + unsigned int len; + int found = 0; + void *buf; + + spin_lock_irqsave(&virtscsi_vq->vq_lock, flags); + + while ((buf = virtqueue_get_buf(virtscsi_vq->vq, &len)) != NULL) { + virtscsi_complete_cmd(vscsi, buf); + found++; + } + + spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags); - blk_mq_virtio_map_queues(qmap, vscsi->vdev, 2); + return found; } static void virtscsi_commit_rqs(struct Scsi_Host *shost, u16 hwq) @@ -751,6 +797,7 @@ static const struct scsi_host_template virtscsi_host_template = { .this_id = -1, .cmd_size = sizeof(struct virtio_scsi_cmd), .queuecommand = virtscsi_queuecommand, + .mq_poll = virtscsi_mq_poll, .commit_rqs = virtscsi_commit_rqs, .change_queue_depth = virtscsi_change_queue_depth, .eh_abort_handler = virtscsi_abort, @@ -795,13 +842,14 @@ static int virtscsi_init(struct virtio_device *vdev, { int err; u32 i; - u32 num_vqs; + u32 num_vqs, num_poll_vqs, num_req_vqs; vq_callback_t **callbacks; const char **names; struct virtqueue **vqs; struct irq_affinity desc = { .pre_vectors = 2 }; - num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE; + num_req_vqs = vscsi->num_queues; + num_vqs = num_req_vqs + VIRTIO_SCSI_VQ_BASE; vqs = kmalloc_array(num_vqs, sizeof(struct virtqueue *), GFP_KERNEL); callbacks = kmalloc_array(num_vqs, sizeof(vq_callback_t *), GFP_KERNEL); @@ -812,15 +860,31 @@ static int virtscsi_init(struct virtio_device *vdev, goto out; } + num_poll_vqs = min_t(unsigned int, virtscsi_poll_queues, + num_req_vqs - 1); + vscsi->io_queues[HCTX_TYPE_DEFAULT] = num_req_vqs - num_poll_vqs; + vscsi->io_queues[HCTX_TYPE_READ] = 0; + vscsi->io_queues[HCTX_TYPE_POLL] = num_poll_vqs; + + dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n", + vscsi->io_queues[HCTX_TYPE_DEFAULT], + vscsi->io_queues[HCTX_TYPE_READ], + vscsi->io_queues[HCTX_TYPE_POLL]); + callbacks[0] = virtscsi_ctrl_done; callbacks[1] = virtscsi_event_done; names[0] = "control"; names[1] = "event"; - for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) { + for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs - num_poll_vqs; i++) { callbacks[i] = virtscsi_req_done; names[i] = "request"; } + for (; i < num_vqs; i++) { + callbacks[i] = NULL; + names[i] = "request_poll"; + } + /* Discover virtqueues and write information to configuration. */ err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc); if (err) @@ -874,6 +938,7 @@ static int virtscsi_probe(struct virtio_device *vdev) sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1; shost->sg_tablesize = sg_elems; + shost->nr_maps = 1; vscsi = shost_priv(shost); vscsi->vdev = vdev; vscsi->num_queues = num_queues; @@ -883,6 +948,9 @@ static int virtscsi_probe(struct virtio_device *vdev) if (err) goto virtscsi_init_failed; + if (vscsi->io_queues[HCTX_TYPE_POLL]) + shost->nr_maps = HCTX_TYPE_POLL + 1; + shost->can_queue = virtqueue_get_vring_size(vscsi->req_vqs[0].vq); cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; -- GitLab From c7e194402be3ed385dfaefaf1681bb731fd776e2 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 15:42:09 +0200 Subject: [PATCH 0776/1290] vdpa: Track device suspended state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set vdpa device suspended state on successful suspend. Clear it on successful resume and reset. The state will be locked by the vhost_vdpa mutex. The mutex is taken during suspend, resume and reset in vhost_vdpa_unlocked_ioctl. The exception is vhost_vdpa_open which does a device reset but that should be safe because it can only happen before the other ops. Signed-off-by: Dragos Tatulea Suggested-by: Eugenio Pérez Message-Id: <20231225134210.151540-2-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index a51c69c078d93..5f046770c0a43 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -59,6 +59,7 @@ struct vhost_vdpa { int in_batch; struct vdpa_iova_range range; u32 batch_asid; + bool suspended; }; static DEFINE_IDA(vhost_vdpa_ida); @@ -232,6 +233,8 @@ static int _compat_vdpa_reset(struct vhost_vdpa *v) struct vdpa_device *vdpa = v->vdpa; u32 flags = 0; + v->suspended = false; + if (v->vdev.vqs) { flags |= !vhost_backend_has_feature(v->vdev.vqs[0], VHOST_BACKEND_F_IOTLB_PERSIST) ? @@ -590,11 +593,16 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; + int ret; if (!ops->suspend) return -EOPNOTSUPP; - return ops->suspend(vdpa); + ret = ops->suspend(vdpa); + if (!ret) + v->suspended = true; + + return ret; } /* After a successful return of this ioctl the device resumes processing @@ -605,11 +613,16 @@ static long vhost_vdpa_resume(struct vhost_vdpa *v) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; + int ret; if (!ops->resume) return -EOPNOTSUPP; - return ops->resume(vdpa); + ret = ops->resume(vdpa); + if (!ret) + v->suspended = false; + + return ret; } static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, -- GitLab From a09483c4065fe1e14812f2371cee1cba78a13d60 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 15:42:10 +0200 Subject: [PATCH 0777/1290] vdpa: Block vq property changes in DRIVER_OK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The virtio standard doesn't allow for virtqueue address and state changes when the device is in DRIVER_OK. Return an error in such cases unless the device is suspended. The suspended device exception is needed because some devices support virtqueue changes when the device is suspended. Signed-off-by: Dragos Tatulea Suggested-by: Eugenio Pérez Message-Id: <20231225134210.151540-3-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 5f046770c0a43..5c3e019c01229 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -703,6 +703,9 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, switch (cmd) { case VHOST_SET_VRING_ADDR: + if ((ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK) && !v->suspended) + return -EINVAL; + if (ops->set_vq_address(vdpa, idx, (u64)(uintptr_t)vq->desc, (u64)(uintptr_t)vq->avail, @@ -711,6 +714,9 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, break; case VHOST_SET_VRING_BASE: + if ((ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK) && !v->suspended) + return -EINVAL; + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) { vq_state.packed.last_avail_idx = vq->last_avail_idx & 0x7fff; vq_state.packed.last_avail_counter = !!(vq->last_avail_idx & 0x8000); -- GitLab From ef067191f73cce3ee192e991ce486d95524655d5 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 17:11:56 +0200 Subject: [PATCH 0778/1290] vdpa/mlx5: Expose resumable vq capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Necessary for checking if resumable vqs are supported by the hardware. Actual support will be added in a downstream patch. Reviewed-by: Gal Pressman Acked-by: Eugenio Pérez Signed-off-by: Dragos Tatulea Message-Id: <20231225151203.152687-2-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6f3631425f386..9eaceaf6bcb06 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1236,7 +1236,8 @@ struct mlx5_ifc_virtio_emulation_cap_bits { u8 reserved_at_c0[0x13]; u8 desc_group_mkey_supported[0x1]; - u8 reserved_at_d4[0xc]; + u8 freeze_to_rdy_supported[0x1]; + u8 reserved_at_d5[0xb]; u8 reserved_at_e0[0x20]; -- GitLab From 651cdaa9c028b1edf0897e10f560fed4f4fb1fb6 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 17:11:57 +0200 Subject: [PATCH 0779/1290] vdpa/mlx5: Allow modifying multiple vq fields in one modify command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a bitmask variable that tracks hw vq field changes that are supposed to be modified on next hw vq change command. This will be useful to set multiple vq fields when resuming the vq. Reviewed-by: Gal Pressman Acked-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Dragos Tatulea Message-Id: <20231225151203.152687-3-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 48 +++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 26ba7da6b4106..1e08a88056402 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -120,6 +120,9 @@ struct mlx5_vdpa_virtqueue { u16 avail_idx; u16 used_idx; int fw_state; + + u64 modified_fields; + struct msi_map map; /* keep last in the struct */ @@ -1181,7 +1184,19 @@ static bool is_valid_state_change(int oldstate, int newstate) } } -static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state) +static bool modifiable_virtqueue_fields(struct mlx5_vdpa_virtqueue *mvq) +{ + /* Only state is always modifiable */ + if (mvq->modified_fields & ~MLX5_VIRTQ_MODIFY_MASK_STATE) + return mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT || + mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND; + + return true; +} + +static int modify_virtqueue(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq, + int state) { int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in); u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {}; @@ -1193,6 +1208,9 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE) return 0; + if (!modifiable_virtqueue_fields(mvq)) + return -EINVAL; + if (!is_valid_state_change(mvq->fw_state, state)) return -EINVAL; @@ -1208,17 +1226,28 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context); - MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, - MLX5_VIRTQ_MODIFY_MASK_STATE); - MLX5_SET(virtio_net_q_object, obj_context, state, state); + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) + MLX5_SET(virtio_net_q_object, obj_context, state, state); + + MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, mvq->modified_fields); err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); kfree(in); if (!err) mvq->fw_state = state; + mvq->modified_fields = 0; + return err; } +static int modify_virtqueue_state(struct mlx5_vdpa_net *ndev, + struct mlx5_vdpa_virtqueue *mvq, + unsigned int state) +{ + mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_STATE; + return modify_virtqueue(ndev, mvq, state); +} + static int counter_set_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) { u32 in[MLX5_ST_SZ_DW(create_virtio_q_counters_in)] = {}; @@ -1347,7 +1376,7 @@ static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) goto err_vq; if (mvq->ready) { - err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); + err = modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); if (err) { mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n", idx, err); @@ -1382,7 +1411,7 @@ static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) return; - if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND)) + if (modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND)) mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n"); if (query_virtqueue(ndev, mvq, &attr)) { @@ -1407,6 +1436,7 @@ static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue * return; suspend_vq(ndev, mvq); + mvq->modified_fields = 0; destroy_virtqueue(ndev, mvq); dealloc_vector(ndev, mvq); counter_set_dealloc(ndev, mvq); @@ -2207,7 +2237,7 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready if (!ready) { suspend_vq(ndev, mvq); } else { - err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); + err = modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); if (err) { mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err); ready = false; @@ -2804,8 +2834,10 @@ static void clear_vqs_ready(struct mlx5_vdpa_net *ndev) { int i; - for (i = 0; i < ndev->mvdev.max_vqs; i++) + for (i = 0; i < ndev->mvdev.max_vqs; i++) { ndev->vqs[i].ready = false; + ndev->vqs[i].modified_fields = 0; + } ndev->mvdev.cvq.ready = false; } -- GitLab From 145096937b8a6a8d5889f5a1a6fb453c52cc48a1 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 17:11:58 +0200 Subject: [PATCH 0780/1290] vdpa/mlx5: Introduce per vq and device resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement vdpa vq and device resume if capability detected. Add support for suspend -> ready state change. Reviewed-by: Gal Pressman Acked-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Dragos Tatulea Message-Id: <20231225151203.152687-4-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 69 +++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 7 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 1e08a88056402..f8f088cced50a 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1170,7 +1170,12 @@ err_cmd: return err; } -static bool is_valid_state_change(int oldstate, int newstate) +static bool is_resumable(struct mlx5_vdpa_net *ndev) +{ + return ndev->mvdev.vdev.config->resume; +} + +static bool is_valid_state_change(int oldstate, int newstate, bool resumable) { switch (oldstate) { case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT: @@ -1178,6 +1183,7 @@ static bool is_valid_state_change(int oldstate, int newstate) case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY: return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND; case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND: + return resumable ? newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY : false; case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR: default: return false; @@ -1200,6 +1206,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, { int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in); u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {}; + bool state_change = false; void *obj_context; void *cmd_hdr; void *in; @@ -1211,9 +1218,6 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, if (!modifiable_virtqueue_fields(mvq)) return -EINVAL; - if (!is_valid_state_change(mvq->fw_state, state)) - return -EINVAL; - in = kzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; @@ -1226,17 +1230,29 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context); - if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) { + if (!is_valid_state_change(mvq->fw_state, state, is_resumable(ndev))) { + err = -EINVAL; + goto done; + } + MLX5_SET(virtio_net_q_object, obj_context, state, state); + state_change = true; + } MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, mvq->modified_fields); err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); - kfree(in); - if (!err) + if (err) + goto done; + + if (state_change) mvq->fw_state = state; mvq->modified_fields = 0; +done: + kfree(in); return err; } @@ -1430,6 +1446,24 @@ static void suspend_vqs(struct mlx5_vdpa_net *ndev) suspend_vq(ndev, &ndev->vqs[i]); } +static void resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) +{ + if (!mvq->initialized || !is_resumable(ndev)) + return; + + if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND) + return; + + if (modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)) + mlx5_vdpa_warn(&ndev->mvdev, "modify to resume failed for vq %u\n", mvq->index); +} + +static void resume_vqs(struct mlx5_vdpa_net *ndev) +{ + for (int i = 0; i < ndev->mvdev.max_vqs; i++) + resume_vq(ndev, &ndev->vqs[i]); +} + static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) { if (!mvq->initialized) @@ -3261,6 +3295,23 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev) return 0; } +static int mlx5_vdpa_resume(struct vdpa_device *vdev) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev; + + ndev = to_mlx5_vdpa_ndev(mvdev); + + mlx5_vdpa_info(mvdev, "resuming device\n"); + + down_write(&ndev->reslock); + mvdev->suspended = false; + resume_vqs(ndev); + register_link_notifier(ndev); + up_write(&ndev->reslock); + return 0; +} + static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group, unsigned int asid) { @@ -3317,6 +3368,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .get_vq_dma_dev = mlx5_get_vq_dma_dev, .free = mlx5_vdpa_free, .suspend = mlx5_vdpa_suspend, + .resume = mlx5_vdpa_resume, /* Op disabled if not supported. */ }; static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu) @@ -3688,6 +3740,9 @@ static int mlx5v_probe(struct auxiliary_device *adev, if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, desc_group_mkey_supported)) mgtdev->vdpa_ops.get_vq_desc_group = NULL; + if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, freeze_to_rdy_supported)) + mgtdev->vdpa_ops.resume = NULL; + err = vdpa_mgmtdev_register(&mgtdev->mgtdev); if (err) goto reg_err; -- GitLab From 9b23417825df470e4c9e98e7ed4b2c37465bfa1e Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 17:11:59 +0200 Subject: [PATCH 0781/1290] vdpa/mlx5: Mark vq addrs for modification in hw vq Addresses get set by .set_vq_address. hw vq addresses will be updated on next modify_virtqueue. Reviewed-by: Gal Pressman Signed-off-by: Dragos Tatulea Message-Id: <20231225151203.152687-5-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 9 +++++++++ include/linux/mlx5/mlx5_ifc_vdpa.h | 1 + 2 files changed, 10 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index f8f088cced50a..80e066de08661 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1209,6 +1209,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, bool state_change = false; void *obj_context; void *cmd_hdr; + void *vq_ctx; void *in; int err; @@ -1230,6 +1231,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid); obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context); + vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context); if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) { if (!is_valid_state_change(mvq->fw_state, state, is_resumable(ndev))) { @@ -1241,6 +1243,12 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, state_change = true; } + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) { + MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr); + MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr); + MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr); + } + MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, mvq->modified_fields); err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); if (err) @@ -2202,6 +2210,7 @@ static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_ mvq->desc_addr = desc_area; mvq->device_addr = device_area; mvq->driver_addr = driver_area; + mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS; return 0; } diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h index b86d51a855f67..9594ac4057406 100644 --- a/include/linux/mlx5/mlx5_ifc_vdpa.h +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -145,6 +145,7 @@ enum { MLX5_VIRTQ_MODIFY_MASK_STATE = (u64)1 << 0, MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS = (u64)1 << 3, MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4, + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS = (u64)1 << 6, MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY = (u64)1 << 14, }; -- GitLab From 60c43b3f6b4eb5a3d672952a0d65991f414ea258 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 17:12:00 +0200 Subject: [PATCH 0782/1290] vdpa/mlx5: Mark vq state for modification in hw vq .set_vq_state will set the indices and mark the fields to be modified in the hw vq. Advertise that the device supports changing the vq state when the device is in DRIVER_OK state and suspended. Reviewed-by: Gal Pressman Signed-off-by: Dragos Tatulea Acked-by: Jason Wang Message-Id: <20231225151203.152687-6-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 8 ++++++++ include/linux/mlx5/mlx5_ifc_vdpa.h | 2 ++ 2 files changed, 10 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 80e066de08661..d6c8506cec8f5 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1249,6 +1249,12 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr); } + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX) + MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx); + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX) + MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx); + MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, mvq->modified_fields); err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); if (err) @@ -2328,6 +2334,8 @@ static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx, mvq->used_idx = state->split.avail_index; mvq->avail_idx = state->split.avail_index; + mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX | + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX; return 0; } diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h index 9594ac4057406..32e712106e684 100644 --- a/include/linux/mlx5/mlx5_ifc_vdpa.h +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -146,6 +146,8 @@ enum { MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS = (u64)1 << 3, MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4, MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS = (u64)1 << 6, + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX = (u64)1 << 7, + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX = (u64)1 << 8, MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY = (u64)1 << 14, }; -- GitLab From f756dd3e2a4c704c0ab5ecb143ab71f1249af497 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 17:12:01 +0200 Subject: [PATCH 0783/1290] vdpa/mlx5: Use vq suspend/resume during .set_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of tearing down and setting up vq resources, use vq suspend/resume during .set_map to speed things up a bit. The vq mr is updated with the new mapping while the vqs are suspended. If the device doesn't support resumable vqs, do the old teardown and setup dance. Reviewed-by: Gal Pressman Acked-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Dragos Tatulea Message-Id: <20231225151203.152687-7-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 46 ++++++++++++++++++++++++------ include/linux/mlx5/mlx5_ifc_vdpa.h | 1 + 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index d6c8506cec8f5..6a21223d97a8e 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1206,6 +1206,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, { int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in); u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {}; + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; bool state_change = false; void *obj_context; void *cmd_hdr; @@ -1255,6 +1256,24 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX) MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx); + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) { + struct mlx5_vdpa_mr *mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]]; + + if (mr) + MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, mr->mkey); + else + mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY; + } + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) { + struct mlx5_vdpa_mr *mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]]; + + if (mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported)) + MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, mr->mkey); + else + mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY; + } + MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, mvq->modified_fields); err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out)); if (err) @@ -2784,24 +2803,35 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, unsigned int asid) { struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + bool teardown = !is_resumable(ndev); int err; suspend_vqs(ndev); - err = save_channels_info(ndev); - if (err) - return err; + if (teardown) { + err = save_channels_info(ndev); + if (err) + return err; - teardown_driver(ndev); + teardown_driver(ndev); + } mlx5_vdpa_update_mr(mvdev, new_mr, asid); + for (int i = 0; i < ndev->cur_num_vqs; i++) + ndev->vqs[i].modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY | + MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY; + if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended) return 0; - restore_channels_info(ndev); - err = setup_driver(mvdev); - if (err) - return err; + if (teardown) { + restore_channels_info(ndev); + err = setup_driver(mvdev); + if (err) + return err; + } + + resume_vqs(ndev); return 0; } diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h index 32e712106e684..40371c916cf94 100644 --- a/include/linux/mlx5/mlx5_ifc_vdpa.h +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -148,6 +148,7 @@ enum { MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS = (u64)1 << 6, MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX = (u64)1 << 7, MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX = (u64)1 << 8, + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY = (u64)1 << 11, MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY = (u64)1 << 14, }; -- GitLab From a06bd11b18fdf2d40e2b81d4318abc6cc38e70c9 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 17:12:02 +0200 Subject: [PATCH 0784/1290] vdpa/mlx5: Introduce reference counting to mrs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deleting the old mr during mr update (.set_map) and then modifying the vqs with the new mr is not a good flow for firmware. The firmware expects that mkeys are deleted after there are no more vqs referencing them. Introduce reference counting for mrs to fix this. It is the only way to make sure that mkeys are not in use by vqs. An mr reference is taken when the mr is associated to the mr asid table and when the mr is linked to the vq on create/modify. The reference is released when the mkey is unlinked from the vq (trough modify/destroy) and from the mr asid table. To make things consistent, get rid of mlx5_vdpa_destroy_mr and use get/put semantics everywhere. Reviewed-by: Gal Pressman Acked-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Dragos Tatulea Message-Id: <20231225151203.152687-8-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 8 +++-- drivers/vdpa/mlx5/core/mr.c | 50 ++++++++++++++++++++---------- drivers/vdpa/mlx5/net/mlx5_vnet.c | 45 ++++++++++++++++++++++----- 3 files changed, 78 insertions(+), 25 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 84547d998bcf3..1a0d27b6e09af 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -35,6 +35,8 @@ struct mlx5_vdpa_mr { struct vhost_iotlb *iotlb; bool user_mr; + + refcount_t refcount; }; struct mlx5_vdpa_resources { @@ -118,8 +120,10 @@ int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, u32 mkey); struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb); void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev); -void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev, - struct mlx5_vdpa_mr *mr); +void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr); +void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr); void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr, unsigned int asid); diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index 2197c46e563a1..c7dc8914354aa 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -498,32 +498,52 @@ static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr static void _mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) { + if (WARN_ON(!mr)) + return; + if (mr->user_mr) destroy_user_mr(mvdev, mr); else destroy_dma_mr(mvdev, mr); vhost_iotlb_free(mr->iotlb); + + kfree(mr); } -void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev, - struct mlx5_vdpa_mr *mr) +static void _mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr) { if (!mr) return; + if (refcount_dec_and_test(&mr->refcount)) + _mlx5_vdpa_destroy_mr(mvdev, mr); +} + +void mlx5_vdpa_put_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr) +{ mutex_lock(&mvdev->mr_mtx); + _mlx5_vdpa_put_mr(mvdev, mr); + mutex_unlock(&mvdev->mr_mtx); +} - _mlx5_vdpa_destroy_mr(mvdev, mr); +static void _mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr) +{ + if (!mr) + return; - for (int i = 0; i < MLX5_VDPA_NUM_AS; i++) { - if (mvdev->mr[i] == mr) - mvdev->mr[i] = NULL; - } + refcount_inc(&mr->refcount); +} +void mlx5_vdpa_get_mr(struct mlx5_vdpa_dev *mvdev, + struct mlx5_vdpa_mr *mr) +{ + mutex_lock(&mvdev->mr_mtx); + _mlx5_vdpa_get_mr(mvdev, mr); mutex_unlock(&mvdev->mr_mtx); - - kfree(mr); } void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev, @@ -534,20 +554,16 @@ void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev, mutex_lock(&mvdev->mr_mtx); + _mlx5_vdpa_put_mr(mvdev, old_mr); mvdev->mr[asid] = new_mr; - if (old_mr) { - _mlx5_vdpa_destroy_mr(mvdev, old_mr); - kfree(old_mr); - } mutex_unlock(&mvdev->mr_mtx); - } void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev) { for (int i = 0; i < MLX5_VDPA_NUM_AS; i++) - mlx5_vdpa_destroy_mr(mvdev, mvdev->mr[i]); + mlx5_vdpa_update_mr(mvdev, NULL, i); prune_iotlb(mvdev->cvq.iotlb); } @@ -607,6 +623,8 @@ struct mlx5_vdpa_mr *mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, if (err) goto out_err; + refcount_set(&mr->refcount, 1); + return mr; out_err: @@ -651,7 +669,7 @@ int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid) if (asid >= MLX5_VDPA_NUM_AS) return -EINVAL; - mlx5_vdpa_destroy_mr(mvdev, mvdev->mr[asid]); + mlx5_vdpa_update_mr(mvdev, NULL, asid); if (asid == 0 && MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { if (mlx5_vdpa_create_dma_mr(mvdev)) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 6a21223d97a8e..133cbb66dcfe4 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -123,6 +123,9 @@ struct mlx5_vdpa_virtqueue { u64 modified_fields; + struct mlx5_vdpa_mr *vq_mr; + struct mlx5_vdpa_mr *desc_mr; + struct msi_map map; /* keep last in the struct */ @@ -946,6 +949,14 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque kfree(in); mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + mlx5_vdpa_get_mr(mvdev, vq_mr); + mvq->vq_mr = vq_mr; + + if (vq_desc_mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported)) { + mlx5_vdpa_get_mr(mvdev, vq_desc_mr); + mvq->desc_mr = vq_desc_mr; + } + return 0; err_cmd: @@ -972,6 +983,12 @@ static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtq } mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE; umems_destroy(ndev, mvq); + + mlx5_vdpa_put_mr(&ndev->mvdev, mvq->vq_mr); + mvq->vq_mr = NULL; + + mlx5_vdpa_put_mr(&ndev->mvdev, mvq->desc_mr); + mvq->desc_mr = NULL; } static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw) @@ -1207,6 +1224,8 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in); u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {}; struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; + struct mlx5_vdpa_mr *desc_mr = NULL; + struct mlx5_vdpa_mr *vq_mr = NULL; bool state_change = false; void *obj_context; void *cmd_hdr; @@ -1257,19 +1276,19 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx); if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) { - struct mlx5_vdpa_mr *mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]]; + vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]]; - if (mr) - MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, mr->mkey); + if (vq_mr) + MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey); else mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY; } if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) { - struct mlx5_vdpa_mr *mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]]; + desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]]; - if (mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported)) - MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, mr->mkey); + if (desc_mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported)) + MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, desc_mr->mkey); else mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY; } @@ -1282,6 +1301,18 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, if (state_change) mvq->fw_state = state; + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) { + mlx5_vdpa_put_mr(mvdev, mvq->vq_mr); + mlx5_vdpa_get_mr(mvdev, vq_mr); + mvq->vq_mr = vq_mr; + } + + if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) { + mlx5_vdpa_put_mr(mvdev, mvq->desc_mr); + mlx5_vdpa_get_mr(mvdev, desc_mr); + mvq->desc_mr = desc_mr; + } + mvq->modified_fields = 0; done: @@ -3095,7 +3126,7 @@ static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, return mlx5_vdpa_update_cvq_iotlb(mvdev, iotlb, asid); out_err: - mlx5_vdpa_destroy_mr(mvdev, new_mr); + mlx5_vdpa_put_mr(mvdev, new_mr); return err; } -- GitLab From f16d65124380ac6de8055c4a8e5373a1043bb09b Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 17:12:03 +0200 Subject: [PATCH 0785/1290] vdpa/mlx5: Add mkey leak detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track allocated mrs in a list and show warning when leaks are detected on device free or reset. Reviewed-by: Gal Pressman Acked-by: Eugenio Pérez Signed-off-by: Dragos Tatulea Message-Id: <20231225151203.152687-9-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 2 ++ drivers/vdpa/mlx5/core/mr.c | 23 +++++++++++++++++++++++ drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 ++ 3 files changed, 27 insertions(+) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 1a0d27b6e09af..50aac8fe57ef5 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -37,6 +37,7 @@ struct mlx5_vdpa_mr { bool user_mr; refcount_t refcount; + struct list_head mr_list; }; struct mlx5_vdpa_resources { @@ -95,6 +96,7 @@ struct mlx5_vdpa_dev { u32 generation; struct mlx5_vdpa_mr *mr[MLX5_VDPA_NUM_AS]; + struct list_head mr_list_head; /* serialize mr access */ struct mutex mr_mtx; struct mlx5_control_vq cvq; diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index c7dc8914354aa..4758914ccf860 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -508,6 +508,8 @@ static void _mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_ vhost_iotlb_free(mr->iotlb); + list_del(&mr->mr_list); + kfree(mr); } @@ -560,12 +562,31 @@ void mlx5_vdpa_update_mr(struct mlx5_vdpa_dev *mvdev, mutex_unlock(&mvdev->mr_mtx); } +static void mlx5_vdpa_show_mr_leaks(struct mlx5_vdpa_dev *mvdev) +{ + struct mlx5_vdpa_mr *mr; + + mutex_lock(&mvdev->mr_mtx); + + list_for_each_entry(mr, &mvdev->mr_list_head, mr_list) { + + mlx5_vdpa_warn(mvdev, "mkey still alive after resource delete: " + "mr: %p, mkey: 0x%x, refcount: %u\n", + mr, mr->mkey, refcount_read(&mr->refcount)); + } + + mutex_unlock(&mvdev->mr_mtx); + +} + void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev) { for (int i = 0; i < MLX5_VDPA_NUM_AS; i++) mlx5_vdpa_update_mr(mvdev, NULL, i); prune_iotlb(mvdev->cvq.iotlb); + + mlx5_vdpa_show_mr_leaks(mvdev); } static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, @@ -592,6 +613,8 @@ static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, if (err) goto err_iotlb; + list_add_tail(&mr->mr_list, &mvdev->mr_list_head); + return 0; err_iotlb: diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 133cbb66dcfe4..778821bab7d93 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3722,6 +3722,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, if (err) goto err_mpfs; + INIT_LIST_HEAD(&mvdev->mr_list_head); + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { err = mlx5_vdpa_create_dma_mr(mvdev); if (err) -- GitLab From 06c59d427017fcde3107c236177fcc74c9db7909 Mon Sep 17 00:00:00 2001 From: William Butler Date: Wed, 10 Jan 2024 18:28:55 +0000 Subject: [PATCH 0786/1290] nvme-pci: set doorbell config before unquiescing During resets, if queues are unquiesced first, then the host can submit IOs to the controller using shadow doorbell logic but the controller won't be aware. This can lead to necessary MMIO doorbells from being not issued, causing requests to be delayed and timed-out. Signed-off-by: William Butler Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 75e763ce09aa0..46d3897b8986a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2746,10 +2746,10 @@ static void nvme_reset_work(struct work_struct *work) * controller around but remove all namespaces. */ if (dev->online_queues > 1) { + nvme_dbbuf_set(dev); nvme_unquiesce_io_queues(&dev->ctrl); nvme_wait_freeze(&dev->ctrl); nvme_pci_update_nr_queues(dev); - nvme_dbbuf_set(dev); nvme_unfreeze(&dev->ctrl); } else { dev_warn(dev->ctrl.device, "IO queues lost\n"); -- GitLab From fe80eb15dea5125ea64845c9de0dd7f8478dd267 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2024 10:05:32 -0700 Subject: [PATCH 0787/1290] io_uring/rw: cleanup io_rw_done() This originally came from the aio side, and it's laid out rather oddly. The common case here is that we either get -EIOCBQUEUED from submitting an async request, or that we complete the request correctly with the given number of bytes. Handling the odd internal restart error codes is not a common operation. Lay it out a bit more optimally that better explains the normal flow, and switch to avoiding the indirect call completely as this is our kiocb and we know the completion handler can only be one of two possible variants. While at it, move it to where it belongs in the file, with fellow end IO helpers. Outside of being easier to read, this also reduces the text size of the function by 24 bytes for me on arm64. Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- io_uring/rw.c | 48 +++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 0c856726b15db..118cc9f1cf160 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -168,27 +168,6 @@ void io_readv_writev_cleanup(struct io_kiocb *req) kfree(io->free_iovec); } -static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) -{ - switch (ret) { - case -EIOCBQUEUED: - break; - case -ERESTARTSYS: - case -ERESTARTNOINTR: - case -ERESTARTNOHAND: - case -ERESTART_RESTARTBLOCK: - /* - * We can't just restart the syscall, since previously - * submitted sqes may already be in progress. Just fail this - * IO with EINTR. - */ - ret = -EINTR; - fallthrough; - default: - kiocb->ki_complete(kiocb, ret); - } -} - static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); @@ -371,6 +350,33 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) smp_store_release(&req->iopoll_completed, 1); } +static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +{ + /* IO was queued async, completion will happen later */ + if (ret == -EIOCBQUEUED) + return; + + /* transform internal restart error codes */ + if (unlikely(ret < 0)) { + switch (ret) { + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* + * We can't just restart the syscall, since previously + * submitted sqes may already be in progress. Just fail + * this IO with EINTR. + */ + ret = -EINTR; + break; + } + } + + INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll, + io_complete_rw, kiocb, ret); +} + static int kiocb_done(struct io_kiocb *req, ssize_t ret, unsigned int issue_flags) { -- GitLab From 07a29b134ce8e47aef15ea71eab8e6b3734a9720 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 8 Dec 2023 13:53:20 +0100 Subject: [PATCH 0788/1290] nvmet-tcp: avoid circular locking dependency on install_queue() nvmet_tcp_install_queue() is driven from the ->io_work workqueue function, but will call flush_workqueue() which might trigger ->release_work() which in itself calls flush_work on ->io_work. To avoid that check for pending queue in disconnecting status, and return 'controller busy' when we reached a certain threshold. Signed-off-by: Hannes Reinecke Tested-by: Shin'ichiro Kawasaki Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 4dc60cbcb205b..6a1e6bb80062d 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -25,6 +25,7 @@ #define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) #define NVMET_TCP_MAXH2CDATA 0x400000 /* 16M arbitrary limit */ +#define NVMET_TCP_BACKLOG 128 static int param_store_val(const char *str, int *val, int min, int max) { @@ -2067,7 +2068,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) goto err_sock; } - ret = kernel_listen(port->sock, 128); + ret = kernel_listen(port->sock, NVMET_TCP_BACKLOG); if (ret) { pr_err("failed to listen %d on port sock\n", ret); goto err_sock; @@ -2133,8 +2134,19 @@ static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq) container_of(sq, struct nvmet_tcp_queue, nvme_sq); if (sq->qid == 0) { - /* Let inflight controller teardown complete */ - flush_workqueue(nvmet_wq); + struct nvmet_tcp_queue *q; + int pending = 0; + + /* Check for pending controller teardown */ + mutex_lock(&nvmet_tcp_queue_mutex); + list_for_each_entry(q, &nvmet_tcp_queue_list, queue_list) { + if (q->nvme_sq.ctrl == sq->ctrl && + q->state == NVMET_TCP_Q_DISCONNECTING) + pending++; + } + mutex_unlock(&nvmet_tcp_queue_mutex); + if (pending > NVMET_TCP_BACKLOG) + return NVME_SC_CONNECT_CTRL_BUSY; } queue->nr_cmds = sq->size * 2; -- GitLab From 31deaeb11ba7a885116c9c30892b9f763c04d59c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 8 Dec 2023 13:53:21 +0100 Subject: [PATCH 0789/1290] nvmet-rdma: avoid circular locking dependency on install_queue() nvmet_rdma_install_queue() is driven from the ->io_work workqueue function, but will call flush_workqueue() which might trigger ->release_work() which in itself calls flush_work on ->io_work. To avoid that check for pending queue in disconnecting status, and return 'controller busy' when we reached a certain threshold. Signed-off-by: Hannes Reinecke Tested-by: Shin'ichiro Kawasaki Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/rdma.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 4597bca43a6d8..667f9c04f35d5 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -37,6 +37,8 @@ #define NVMET_RDMA_MAX_MDTS 8 #define NVMET_RDMA_MAX_METADATA_MDTS 5 +#define NVMET_RDMA_BACKLOG 128 + struct nvmet_rdma_srq; struct nvmet_rdma_cmd { @@ -1583,8 +1585,19 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, } if (queue->host_qid == 0) { - /* Let inflight controller teardown complete */ - flush_workqueue(nvmet_wq); + struct nvmet_rdma_queue *q; + int pending = 0; + + /* Check for pending controller teardown */ + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(q, &nvmet_rdma_queue_list, queue_list) { + if (q->nvme_sq.ctrl == queue->nvme_sq.ctrl && + q->state == NVMET_RDMA_Q_DISCONNECTING) + pending++; + } + mutex_unlock(&nvmet_rdma_queue_mutex); + if (pending > NVMET_RDMA_BACKLOG) + return NVME_SC_CONNECT_CTRL_BUSY; } ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); @@ -1880,7 +1893,7 @@ static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port) goto out_destroy_id; } - ret = rdma_listen(cm_id, 128); + ret = rdma_listen(cm_id, NVMET_RDMA_BACKLOG); if (ret) { pr_err("listening to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; -- GitLab From 78f70c02bdbccb5e9b0b0c728185d4aeb7044ace Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Jan 2024 08:57:19 +0100 Subject: [PATCH 0790/1290] vfio/virtio: fix virtio-pci dependency The new vfio-virtio driver already has a dependency on VIRTIO_PCI_ADMIN_LEGACY, but that is a bool symbol and allows vfio-virtio to be built-in even if virtio-pci itself is a loadable module. This leads to a link failure: aarch64-linux-ld: drivers/vfio/pci/virtio/main.o: in function `virtiovf_pci_probe': main.c:(.text+0xec): undefined reference to `virtio_pci_admin_has_legacy_io' aarch64-linux-ld: drivers/vfio/pci/virtio/main.o: in function `virtiovf_pci_init_device': main.c:(.text+0x260): undefined reference to `virtio_pci_admin_legacy_io_notify_info' aarch64-linux-ld: drivers/vfio/pci/virtio/main.o: in function `virtiovf_pci_bar0_rw': main.c:(.text+0x6ec): undefined reference to `virtio_pci_admin_legacy_common_io_read' aarch64-linux-ld: main.c:(.text+0x6f4): undefined reference to `virtio_pci_admin_legacy_device_io_read' aarch64-linux-ld: main.c:(.text+0x7f0): undefined reference to `virtio_pci_admin_legacy_common_io_write' aarch64-linux-ld: main.c:(.text+0x7f8): undefined reference to `virtio_pci_admin_legacy_device_io_write' Add another explicit dependency on the tristate symbol. Fixes: eb61eca0e8c3 ("vfio/virtio: Introduce a vfio driver over virtio devices") Signed-off-by: Arnd Bergmann Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240109075731.2726731-1-arnd@kernel.org Signed-off-by: Alex Williamson --- drivers/vfio/pci/virtio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/virtio/Kconfig b/drivers/vfio/pci/virtio/Kconfig index fc3a0be9d8d42..bd80eca4a196c 100644 --- a/drivers/vfio/pci/virtio/Kconfig +++ b/drivers/vfio/pci/virtio/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config VIRTIO_VFIO_PCI tristate "VFIO support for VIRTIO NET PCI devices" - depends on VIRTIO_PCI_ADMIN_LEGACY + depends on VIRTIO_PCI && VIRTIO_PCI_ADMIN_LEGACY select VFIO_PCI_CORE help This provides support for exposing VIRTIO NET VF devices which support -- GitLab From d61b40bf15ce453f3aa71f6b423938e239e7f8f8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 8 Jan 2024 18:17:34 -0800 Subject: [PATCH 0791/1290] xfs: fix backwards logic in xfs_bmap_alloc_account We're only allocating from the realtime device if the inode is marked for realtime and we're /not/ allocating into the attr fork. Fixes: 58643460546d ("xfs: also use xfs_bmap_btalloc_accounting for RT allocations") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 98aaca933bddb..f362345467fac 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3277,7 +3277,7 @@ xfs_bmap_alloc_account( struct xfs_bmalloca *ap) { bool isrt = XFS_IS_REALTIME_INODE(ap->ip) && - (ap->flags & XFS_BMAPI_ATTRFORK); + !(ap->flags & XFS_BMAPI_ATTRFORK); uint fld; if (ap->flags & XFS_BMAPI_COWFORK) { -- GitLab From 4dc4af9ce32681fbd16aa0e757ccba341cc9d4ca Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 6 Dec 2023 12:08:09 +0100 Subject: [PATCH 0792/1290] riscv: sbi: Introduce system suspend support When the SUSP SBI extension is present it implies that the standard "suspend to RAM" type is available. Wire it up to the generic platform suspend support, also applying the already present support for non-retentive CPU suspend. When the kernel is built with CONFIG_SUSPEND, one can do 'echo mem > /sys/power/state' to suspend. Resumption will occur when a platform-specific wake-up event arrives. Signed-off-by: Andrew Jones Tested-by: Samuel Holland Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20231206110807.35882-4-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 +- arch/riscv/include/asm/sbi.h | 9 ++++++++ arch/riscv/kernel/suspend.c | 44 ++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 3db3d0fa046ef..e0e7cb89ee349 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -66,7 +66,7 @@ config RISCV select CLINT_TIMER if !MMU select CLONE_BACKWARDS select COMMON_CLK - select CPU_PM if CPU_IDLE || HIBERNATION + select CPU_PM if CPU_IDLE || HIBERNATION || SUSPEND select EDAC_SUPPORT select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE) select GENERIC_ARCH_TOPOLOGY diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 0892f4421bc4a..f09356e187dfc 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -29,6 +29,7 @@ enum sbi_ext_id { SBI_EXT_RFENCE = 0x52464E43, SBI_EXT_HSM = 0x48534D, SBI_EXT_SRST = 0x53525354, + SBI_EXT_SUSP = 0x53555350, SBI_EXT_PMU = 0x504D55, SBI_EXT_DBCN = 0x4442434E, @@ -114,6 +115,14 @@ enum sbi_srst_reset_reason { SBI_SRST_RESET_REASON_SYS_FAILURE, }; +enum sbi_ext_susp_fid { + SBI_EXT_SUSP_SYSTEM_SUSPEND = 0, +}; + +enum sbi_ext_susp_sleep_type { + SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM = 0, +}; + enum sbi_ext_pmu_fid { SBI_EXT_PMU_NUM_COUNTERS = 0, SBI_EXT_PMU_COUNTER_GET_INFO, diff --git a/arch/riscv/kernel/suspend.c b/arch/riscv/kernel/suspend.c index 3c89b8ec69c49..239509367e423 100644 --- a/arch/riscv/kernel/suspend.c +++ b/arch/riscv/kernel/suspend.c @@ -4,8 +4,12 @@ * Copyright (c) 2022 Ventana Micro Systems Inc. */ +#define pr_fmt(fmt) "suspend: " fmt + #include +#include #include +#include #include void suspend_save_csrs(struct suspend_context *context) @@ -85,3 +89,43 @@ int cpu_suspend(unsigned long arg, return rc; } + +#ifdef CONFIG_RISCV_SBI +static int sbi_system_suspend(unsigned long sleep_type, + unsigned long resume_addr, + unsigned long opaque) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_SUSP, SBI_EXT_SUSP_SYSTEM_SUSPEND, + sleep_type, resume_addr, opaque, 0, 0, 0); + if (ret.error) + return sbi_err_map_linux_errno(ret.error); + + return ret.value; +} + +static int sbi_system_suspend_enter(suspend_state_t state) +{ + return cpu_suspend(SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM, sbi_system_suspend); +} + +static const struct platform_suspend_ops sbi_system_suspend_ops = { + .valid = suspend_valid_only_mem, + .enter = sbi_system_suspend_enter, +}; + +static int __init sbi_system_suspend_init(void) +{ + if (sbi_spec_version >= sbi_mk_version(2, 0) && + sbi_probe_extension(SBI_EXT_SUSP) > 0) { + pr_info("SBI SUSP extension detected\n"); + if (IS_ENABLED(CONFIG_SUSPEND)) + suspend_set_ops(&sbi_system_suspend_ops); + } + + return 0; +} + +arch_initcall(sbi_system_suspend_init); +#endif /* CONFIG_RISCV_SBI */ -- GitLab From a452816132d699bbb2af6fab8530685306054bda Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Wed, 27 Dec 2023 09:57:38 -0800 Subject: [PATCH 0793/1290] dt-bindings: riscv: cpus: Clarify mmu-type interpretation The current description implies that only a single address translation mode is available to the operating system. However, some implementations support multiple address translation modes, and the operating system is free to choose between them. Per the RISC-V privileged specification, Sv48 implementations must also implement Sv39, and likewise Sv57 implies support for Sv48. This means it is possible to describe all supported address translation modes using a single value, by naming the largest supported mode. This appears to have been the intended usage of the property, so note it explicitly. Fixes: 4fd669a8c487 ("dt-bindings: riscv: convert cpu binding to json-schema") Signed-off-by: Samuel Holland Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20231227175739.1453782-1-samuel.holland@sifive.com Signed-off-by: Palmer Dabbelt --- Documentation/devicetree/bindings/riscv/cpus.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/riscv/cpus.yaml b/Documentation/devicetree/bindings/riscv/cpus.yaml index 23646b684ea27..72f8af4828185 100644 --- a/Documentation/devicetree/bindings/riscv/cpus.yaml +++ b/Documentation/devicetree/bindings/riscv/cpus.yaml @@ -63,8 +63,8 @@ properties: mmu-type: description: - Identifies the MMU address translation mode used on this - hart. These values originate from the RISC-V Privileged + Identifies the largest MMU address translation mode supported by + this hart. These values originate from the RISC-V Privileged Specification document, available from https://riscv.org/specifications/ $ref: /schemas/types.yaml#/definitions/string -- GitLab From 07df87c0f8815898cb994408c4b6dd542a1394b8 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Fri, 8 Dec 2023 16:06:51 +0000 Subject: [PATCH 0794/1290] dt-bindings: riscv: permit numbers in "riscv,isa" There are some extensions that contain numbers, such as Zve32f, which are enabled by the "max" cpu type in QEMU. Signed-off-by: Conor Dooley Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231208-uncolored-oxidant-5ab37dd3ab84@spud Signed-off-by: Palmer Dabbelt --- Documentation/devicetree/bindings/riscv/extensions.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/riscv/extensions.yaml b/Documentation/devicetree/bindings/riscv/extensions.yaml index 27beedb981987..63d81dc895e5c 100644 --- a/Documentation/devicetree/bindings/riscv/extensions.yaml +++ b/Documentation/devicetree/bindings/riscv/extensions.yaml @@ -48,7 +48,7 @@ properties: insensitive, letters in the riscv,isa string must be all lowercase. $ref: /schemas/types.yaml#/definitions/string - pattern: ^rv(?:64|32)imaf?d?q?c?b?k?j?p?v?h?(?:[hsxz](?:[a-z])+)?(?:_[hsxz](?:[a-z])+)*$ + pattern: ^rv(?:64|32)imaf?d?q?c?b?k?j?p?v?h?(?:[hsxz](?:[0-9a-z])+)?(?:_[hsxz](?:[0-9a-z])+)*$ deprecated: true riscv,isa-base: -- GitLab From d3e591a38c98d448ae84eba1f89388c55382cb0e Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Sun, 29 Oct 2023 09:35:00 -0300 Subject: [PATCH 0795/1290] dt-bindings: riscv: Document cbop-block-size Following the examples of cbom-block-size and cboz-block-size, cbop-block-size is the cache size of Zicbop (cbo.prefetch) operations. The most common case is to have all cache block sizes to be the same size (e.g. profiles such as rva22u64 mandates a 64 bytes size for all cache operations), but there's no specification requirement for that, and an implementation can have different cache sizes for each operation. Cc: Rob Herring Cc: Conor Dooley Signed-off-by: Daniel Henrique Barboza Acked-by: Conor Dooley Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20231029123500.739409-1-dbarboza@ventanamicro.com Signed-off-by: Palmer Dabbelt --- Documentation/devicetree/bindings/riscv/cpus.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/riscv/cpus.yaml b/Documentation/devicetree/bindings/riscv/cpus.yaml index 72f8af4828185..9d8670c00e3b3 100644 --- a/Documentation/devicetree/bindings/riscv/cpus.yaml +++ b/Documentation/devicetree/bindings/riscv/cpus.yaml @@ -80,6 +80,11 @@ properties: description: The blocksize in bytes for the Zicbom cache operations. + riscv,cbop-block-size: + $ref: /schemas/types.yaml#/definitions/uint32 + description: + The blocksize in bytes for the Zicbop cache operations. + riscv,cboz-block-size: $ref: /schemas/types.yaml#/definitions/uint32 description: -- GitLab From e3b3ec967a7d93b9010a5af9a2394c8b5c8f31ed Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Thu, 11 Jan 2024 11:52:26 +0100 Subject: [PATCH 0796/1290] ASoC: mediatek: sof-common: Add NULL check for normal_link string It's not granted that all entries of struct sof_conn_stream declare a `normal_link` (a non-SOF, direct link) string, and this is the case for SoCs that support only SOF paths (hence do not support both direct and SOF usecases). For example, in the case of MT8188 there is no normal_link string in any of the sof_conn_stream entries and there will be more drivers doing that in the future. To avoid possible NULL pointer KPs, add a NULL check for `normal_link`. Fixes: 0caf1120c583 ("ASoC: mediatek: mt8195: extract SOF common code") Signed-off-by: AngeloGioacchino Del Regno Link: https://msgid.link/r/20240111105226.117603-1-angelogioacchino.delregno@collabora.com Signed-off-by: Mark Brown --- sound/soc/mediatek/common/mtk-dsp-sof-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/mediatek/common/mtk-dsp-sof-common.c b/sound/soc/mediatek/common/mtk-dsp-sof-common.c index f3894010f6563..7ec8965a70c06 100644 --- a/sound/soc/mediatek/common/mtk-dsp-sof-common.c +++ b/sound/soc/mediatek/common/mtk-dsp-sof-common.c @@ -24,7 +24,7 @@ int mtk_sof_dai_link_fixup(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai_link *sof_dai_link = NULL; const struct sof_conn_stream *conn = &sof_priv->conn_streams[i]; - if (strcmp(rtd->dai_link->name, conn->normal_link)) + if (conn->normal_link && strcmp(rtd->dai_link->name, conn->normal_link)) continue; for_each_card_rtds(card, runtime) { -- GitLab From ff172d4818ad32dba433dae189e36684e43c5c74 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 14 Dec 2023 14:29:35 +0100 Subject: [PATCH 0797/1290] riscv: Use hugepage mappings for vmemmap This will allow better TLB utilization and then should be more performant. Before: ---[ vmemmap start ]--- 0xffff8d8002000000-0xffff8d8012000000 0x000000046ec00000 256M PTE . .. .. D A G . . W R V ---[ vmemmap end ]--- After: ---[ vmemmap start ]--- 0xffff8d8002000000-0xffff8d8012000000 0x000000046ec00000 256M PMD . .. .. D A G . . W R V ---[ vmemmap end ]--- Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231214132935.212864-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index a65937336cdc8..f533dd667a83a 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1387,10 +1387,29 @@ void __init misc_mem_init(void) } #ifdef CONFIG_SPARSEMEM_VMEMMAP +void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) +{ + pmd_set_huge(pmd, virt_to_phys(p), PAGE_KERNEL); +} + +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + vmemmap_verify((pte_t *)pmdp, node, addr, next); + return 1; +} + int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { - return vmemmap_populate_basepages(start, end, node, NULL); + /* + * Note that SPARSEMEM_VMEMMAP is only selected for rv64 and that we + * can't use hugepage mappings for 2-level page table because in case of + * memory hotplug, we are not able to update all the page tables with + * the new PMDs. + */ + return vmemmap_populate_hugepages(start, end, node, NULL); } #endif -- GitLab From 54d7431af73e2fa53b73cfeb2bec559c6664a4e4 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 8 Jan 2024 20:36:40 +0100 Subject: [PATCH 0798/1290] riscv: Add support for BATCHED_UNMAP_TLB_FLUSH Allow to defer the flushing of the TLB when unmapping pages, which allows to reduce the numbers of IPI and the number of sfence.vma. The ubenchmarch used in commit 43b3dfdd0455 ("arm64: support batched/deferred tlb shootdown during page reclamation/migration") that was multithreaded to force the usage of IPI shows good performance improvement on all platforms: * Unmatched: ~34% * TH1520 : ~78% * Qemu : ~81% In addition, perf on qemu reports an important decrease in time spent dealing with IPIs: Before: 68.17% main [kernel.kallsyms] [k] __sbi_rfence_v02_call After : 8.64% main [kernel.kallsyms] [k] __sbi_rfence_v02_call * Benchmark: int stick_this_thread_to_core(int core_id) { int num_cores = sysconf(_SC_NPROCESSORS_ONLN); if (core_id < 0 || core_id >= num_cores) return EINVAL; cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(core_id, &cpuset); pthread_t current_thread = pthread_self(); return pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset); } static void *fn_thread (void *p_data) { int ret; pthread_t thread; stick_this_thread_to_core((int)p_data); while (1) { sleep(1); } return NULL; } int main() { volatile unsigned char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); pthread_t threads[4]; int ret; for (int i = 0; i < 4; ++i) { ret = pthread_create(&threads[i], NULL, fn_thread, (void *)i); if (ret) { printf("%s", strerror (ret)); } } memset(p, 0x88, SIZE); for (int k = 0; k < 10000; k++) { /* swap in */ for (int i = 0; i < SIZE; i += 4096) { (void)p[i]; } /* swap out */ madvise(p, SIZE, MADV_PAGEOUT); } for (int i = 0; i < 4; i++) { pthread_cancel(threads[i]); } for (int i = 0; i < 4; i++) { pthread_join(threads[i], NULL); } return 0; } Signed-off-by: Alexandre Ghiti Reviewed-by: Jisheng Zhang Tested-by: Jisheng Zhang # Tested on TH1520 Tested-by: Nam Cao Link: https://lore.kernel.org/r/20240108193640.344929-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- .../features/vm/TLB/arch-support.txt | 2 +- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/tlbbatch.h | 15 ++++ arch/riscv/include/asm/tlbflush.h | 8 +++ arch/riscv/mm/tlbflush.c | 69 +++++++++++++------ 5 files changed, 74 insertions(+), 21 deletions(-) create mode 100644 arch/riscv/include/asm/tlbbatch.h diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt index 8fd22073a847e..d222bd3ee7495 100644 --- a/Documentation/features/vm/TLB/arch-support.txt +++ b/Documentation/features/vm/TLB/arch-support.txt @@ -20,7 +20,7 @@ | openrisc: | .. | | parisc: | TODO | | powerpc: | TODO | - | riscv: | TODO | + | riscv: | ok | | s390: | TODO | | sh: | TODO | | sparc: | TODO | diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e0e7cb89ee349..d42155c29a55b 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -53,6 +53,7 @@ config RISCV select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USES_CFI_TRAPS if CFI_CLANG + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP && MMU select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT diff --git a/arch/riscv/include/asm/tlbbatch.h b/arch/riscv/include/asm/tlbbatch.h new file mode 100644 index 0000000000000..46014f70b9daa --- /dev/null +++ b/arch/riscv/include/asm/tlbbatch.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 Rivos Inc. + */ + +#ifndef _ASM_RISCV_TLBBATCH_H +#define _ASM_RISCV_TLBBATCH_H + +#include + +struct arch_tlbflush_unmap_batch { + struct cpumask cpumask; +}; + +#endif /* _ASM_RISCV_TLBBATCH_H */ diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index 8f3418c5f1724..9c8a67b1285ec 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -46,6 +46,14 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end); void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif + +bool arch_tlbbatch_should_defer(struct mm_struct *mm); +void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm, + unsigned long uaddr); +void arch_flush_tlb_batched_pending(struct mm_struct *mm); +void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); + #else /* CONFIG_SMP && CONFIG_MMU */ #define flush_tlb_all() local_flush_tlb_all() diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index e6659d7368b35..f0190f5fdd050 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -93,29 +93,23 @@ static void __ipi_flush_tlb_range_asid(void *info) local_flush_tlb_range_asid(d->start, d->size, d->stride, d->asid); } -static void __flush_tlb_range(struct mm_struct *mm, unsigned long start, - unsigned long size, unsigned long stride) +static void __flush_tlb_range(struct cpumask *cmask, unsigned long asid, + unsigned long start, unsigned long size, + unsigned long stride) { struct flush_tlb_range_data ftd; - const struct cpumask *cmask; - unsigned long asid = FLUSH_TLB_NO_ASID; bool broadcast; - if (mm) { - unsigned int cpuid; + if (cpumask_empty(cmask)) + return; - cmask = mm_cpumask(mm); - if (cpumask_empty(cmask)) - return; + if (cmask != cpu_online_mask) { + unsigned int cpuid; cpuid = get_cpu(); /* check if the tlbflush needs to be sent to other CPUs */ broadcast = cpumask_any_but(cmask, cpuid) < nr_cpu_ids; - - if (static_branch_unlikely(&use_asid_allocator)) - asid = atomic_long_read(&mm->context.id) & asid_mask; } else { - cmask = cpu_online_mask; broadcast = true; } @@ -135,25 +129,34 @@ static void __flush_tlb_range(struct mm_struct *mm, unsigned long start, local_flush_tlb_range_asid(start, size, stride, asid); } - if (mm) + if (cmask != cpu_online_mask) put_cpu(); } +static inline unsigned long get_mm_asid(struct mm_struct *mm) +{ + return static_branch_unlikely(&use_asid_allocator) ? + atomic_long_read(&mm->context.id) & asid_mask : FLUSH_TLB_NO_ASID; +} + void flush_tlb_mm(struct mm_struct *mm) { - __flush_tlb_range(mm, 0, FLUSH_TLB_MAX_SIZE, PAGE_SIZE); + __flush_tlb_range(mm_cpumask(mm), get_mm_asid(mm), + 0, FLUSH_TLB_MAX_SIZE, PAGE_SIZE); } void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int page_size) { - __flush_tlb_range(mm, start, end - start, page_size); + __flush_tlb_range(mm_cpumask(mm), get_mm_asid(mm), + start, end - start, page_size); } void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { - __flush_tlb_range(vma->vm_mm, addr, PAGE_SIZE, PAGE_SIZE); + __flush_tlb_range(mm_cpumask(vma->vm_mm), get_mm_asid(vma->vm_mm), + addr, PAGE_SIZE, PAGE_SIZE); } void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -185,18 +188,44 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, } } - __flush_tlb_range(vma->vm_mm, start, end - start, stride_size); + __flush_tlb_range(mm_cpumask(vma->vm_mm), get_mm_asid(vma->vm_mm), + start, end - start, stride_size); } void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - __flush_tlb_range(NULL, start, end - start, PAGE_SIZE); + __flush_tlb_range((struct cpumask *)cpu_online_mask, FLUSH_TLB_NO_ASID, + start, end - start, PAGE_SIZE); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - __flush_tlb_range(vma->vm_mm, start, end - start, PMD_SIZE); + __flush_tlb_range(mm_cpumask(vma->vm_mm), get_mm_asid(vma->vm_mm), + start, end - start, PMD_SIZE); } #endif + +bool arch_tlbbatch_should_defer(struct mm_struct *mm) +{ + return true; +} + +void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm, + unsigned long uaddr) +{ + cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); +} + +void arch_flush_tlb_batched_pending(struct mm_struct *mm) +{ + flush_tlb_mm(mm); +} + +void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) +{ + __flush_tlb_range(&batch->cpumask, FLUSH_TLB_NO_ASID, 0, + FLUSH_TLB_MAX_SIZE, PAGE_SIZE); +} -- GitLab From b91c26fdb0e8150cdb610cdaadea62bb5e43bee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=C3=BCllner?= Date: Thu, 23 Nov 2023 19:58:17 +0100 Subject: [PATCH 0799/1290] tools: selftests: riscv: Fix compile warnings in hwprobe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC prints a couple of format string warnings when compiling the hwprobe test. Let's follow the recommendation in Documentation/printk-formats.txt to fix these warnings. Signed-off-by: Christoph Müllner Reviewed-by: Alexandre Ghiti Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20231123185821.2272504-2-christoph.muellner@vrull.eu Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/hwprobe/hwprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/riscv/hwprobe/hwprobe.c b/tools/testing/selftests/riscv/hwprobe/hwprobe.c index c474891df3071..abb825811c70e 100644 --- a/tools/testing/selftests/riscv/hwprobe/hwprobe.c +++ b/tools/testing/selftests/riscv/hwprobe/hwprobe.c @@ -29,7 +29,7 @@ int main(int argc, char **argv) /* Fail if the kernel claims not to recognize a base key. */ if ((i < 4) && (pairs[i].key != i)) ksft_exit_fail_msg("Failed to recognize base key: key != i, " - "key=%ld, i=%ld\n", pairs[i].key, i); + "key=%lld, i=%ld\n", pairs[i].key, i); if (pairs[i].key != RISCV_HWPROBE_KEY_BASE_BEHAVIOR) continue; @@ -37,7 +37,7 @@ int main(int argc, char **argv) if (pairs[i].value & RISCV_HWPROBE_BASE_BEHAVIOR_IMA) continue; - ksft_exit_fail_msg("Unexpected pair: (%ld, %ld)\n", pairs[i].key, pairs[i].value); + ksft_exit_fail_msg("Unexpected pair: (%lld, %llu)\n", pairs[i].key, pairs[i].value); } out = riscv_hwprobe(pairs, 8, 0, 0, 0); -- GitLab From ac7b2a02d62faff8c6d45bacb5cb9ea565b47776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=C3=BCllner?= Date: Thu, 23 Nov 2023 19:58:18 +0100 Subject: [PATCH 0800/1290] tools: selftests: riscv: Fix compile warnings in cbo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC prints a couple of format string warnings when compiling the cbo test. Let's follow the recommendation in Documentation/printk-formats.txt to fix these warnings. Signed-off-by: Christoph Müllner Reviewed-by: Alexandre Ghiti Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20231123185821.2272504-3-christoph.muellner@vrull.eu Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/hwprobe/cbo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/riscv/hwprobe/cbo.c b/tools/testing/selftests/riscv/hwprobe/cbo.c index 50a2cc8aef387..c6a83ab11e223 100644 --- a/tools/testing/selftests/riscv/hwprobe/cbo.c +++ b/tools/testing/selftests/riscv/hwprobe/cbo.c @@ -97,7 +97,7 @@ static void test_zicboz(void *arg) block_size = pair.value; ksft_test_result(rc == 0 && pair.key == RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE && is_power_of_2(block_size), "Zicboz block size\n"); - ksft_print_msg("Zicboz block size: %ld\n", block_size); + ksft_print_msg("Zicboz block size: %llu\n", block_size); illegal_insn = false; cbo_zero(&mem[block_size]); @@ -121,7 +121,7 @@ static void test_zicboz(void *arg) for (j = 0; j < block_size; ++j) { if (mem[i * block_size + j] != expected) { ksft_test_result_fail("cbo.zero check\n"); - ksft_print_msg("cbo.zero check: mem[%d] != 0x%x\n", + ksft_print_msg("cbo.zero check: mem[%llu] != 0x%x\n", i * block_size + j, expected); return; } @@ -201,7 +201,7 @@ int main(int argc, char **argv) pair.key = RISCV_HWPROBE_KEY_IMA_EXT_0; rc = riscv_hwprobe(&pair, 1, sizeof(cpu_set_t), (unsigned long *)&cpus, 0); if (rc < 0) - ksft_exit_fail_msg("hwprobe() failed with %d\n", rc); + ksft_exit_fail_msg("hwprobe() failed with %ld\n", rc); assert(rc == 0 && pair.key == RISCV_HWPROBE_KEY_IMA_EXT_0); if (pair.value & RISCV_HWPROBE_EXT_ZICBOZ) { -- GitLab From b250c90898412878fe56f069b1a6b1b94a7bbdfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=C3=BCllner?= Date: Thu, 23 Nov 2023 19:58:19 +0100 Subject: [PATCH 0801/1290] tools: selftests: riscv: Add missing include for vector test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC raises the following warning: warning: 'status' may be used uninitialized The warning comes from the fact, that the signature of waitpid() is unknown and therefore the initialization of GCC cannot be guessed. Let's add the relevant header to address this warning. Signed-off-by: Christoph Müllner Reviewed-by: Alexandre Ghiti Reviewed-by: Andy Chiu Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20231123185821.2272504-4-christoph.muellner@vrull.eu Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c b/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c index 2c0d2b1126c1e..1f9969bed2355 100644 --- a/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c +++ b/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only + +#include + #define THIS_PROGRAM "./vstate_exec_nolibc" int main(int argc, char **argv) -- GitLab From e1baf5e68ed128c1e22ba43e5190526d85de323c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=C3=BCllner?= Date: Thu, 23 Nov 2023 19:58:20 +0100 Subject: [PATCH 0802/1290] tools: selftests: riscv: Fix compile warnings in vector tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC prints a couple of format string warnings when compiling the vector tests. Let's follow the recommendation in Documentation/printk-formats.txt to fix these warnings. Signed-off-by: Christoph Müllner Reviewed-by: Alexandre Ghiti Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20231123185821.2272504-5-christoph.muellner@vrull.eu Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/vector/v_initval_nolibc.c | 2 +- tools/testing/selftests/riscv/vector/vstate_prctl.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/riscv/vector/v_initval_nolibc.c b/tools/testing/selftests/riscv/vector/v_initval_nolibc.c index 66764edb0d526..1dd94197da30c 100644 --- a/tools/testing/selftests/riscv/vector/v_initval_nolibc.c +++ b/tools/testing/selftests/riscv/vector/v_initval_nolibc.c @@ -27,7 +27,7 @@ int main(void) datap = malloc(MAX_VSIZE); if (!datap) { - ksft_test_result_fail("fail to allocate memory for size = %lu\n", MAX_VSIZE); + ksft_test_result_fail("fail to allocate memory for size = %d\n", MAX_VSIZE); exit(-1); } diff --git a/tools/testing/selftests/riscv/vector/vstate_prctl.c b/tools/testing/selftests/riscv/vector/vstate_prctl.c index b348b475be570..8ad94e08ff4d0 100644 --- a/tools/testing/selftests/riscv/vector/vstate_prctl.c +++ b/tools/testing/selftests/riscv/vector/vstate_prctl.c @@ -68,7 +68,7 @@ int test_and_compare_child(long provided, long expected, int inherit) } rc = launch_test(inherit); if (rc != expected) { - ksft_test_result_fail("Test failed, check %d != %d\n", rc, + ksft_test_result_fail("Test failed, check %d != %ld\n", rc, expected); return -2; } @@ -87,7 +87,7 @@ int main(void) pair.key = RISCV_HWPROBE_KEY_IMA_EXT_0; rc = riscv_hwprobe(&pair, 1, 0, NULL, 0); if (rc < 0) { - ksft_test_result_fail("hwprobe() failed with %d\n", rc); + ksft_test_result_fail("hwprobe() failed with %ld\n", rc); return -1; } -- GitLab From 12c16919652b5873f524c8b361336ecfa5ce5e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=C3=BCllner?= Date: Thu, 23 Nov 2023 19:58:21 +0100 Subject: [PATCH 0803/1290] tools: selftests: riscv: Fix compile warnings in mm tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building the mm tests with a riscv32 compiler, we see a range of shift-count-overflow errors from shifting 1UL by more than 32 bits in do_mmaps(). Since, the relevant code is only called from code that is gated by `__riscv_xlen == 64`, we can just apply the same gating to do_mmaps(). Signed-off-by: Christoph Müllner Reviewed-by: Alexandre Ghiti Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20231123185821.2272504-6-christoph.muellner@vrull.eu Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/mm/mmap_test.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/riscv/mm/mmap_test.h b/tools/testing/selftests/riscv/mm/mmap_test.h index 9b8434f62f570..2e0db9c5be6c3 100644 --- a/tools/testing/selftests/riscv/mm/mmap_test.h +++ b/tools/testing/selftests/riscv/mm/mmap_test.h @@ -18,6 +18,8 @@ struct addresses { int *on_56_addr; }; +// Only works on 64 bit +#if __riscv_xlen == 64 static inline void do_mmaps(struct addresses *mmap_addresses) { /* @@ -50,6 +52,7 @@ static inline void do_mmaps(struct addresses *mmap_addresses) mmap_addresses->on_56_addr = mmap(on_56_bits, 5 * sizeof(int), prot, flags, 0, 0); } +#endif /* __riscv_xlen == 64 */ static inline int memory_layout(void) { -- GitLab From adb1f95d388a43c4c564ef3e436f18900dde978e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 29 Oct 2023 08:20:40 +0100 Subject: [PATCH 0804/1290] riscv: Fix an off-by-one in get_early_cmdline() The ending NULL is not taken into account by strncat(), so switch to strlcat() to correctly compute the size of the available memory when appending CONFIG_CMDLINE to 'early_cmdline'. Fixes: 26e7aacb83df ("riscv: Allow to downgrade paging mode from the command line") Signed-off-by: Christophe JAILLET Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/9f66d2b58c8052d4055e90b8477ee55d9a0914f9.1698564026.git.christophe.jaillet@wanadoo.fr Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/pi/cmdline_early.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/riscv/kernel/pi/cmdline_early.c b/arch/riscv/kernel/pi/cmdline_early.c index 68e786c84c949..f6d4dedffb842 100644 --- a/arch/riscv/kernel/pi/cmdline_early.c +++ b/arch/riscv/kernel/pi/cmdline_early.c @@ -38,8 +38,7 @@ static char *get_early_cmdline(uintptr_t dtb_pa) if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) || IS_ENABLED(CONFIG_CMDLINE_FORCE) || fdt_cmdline_size == 0 /* CONFIG_CMDLINE_FALLBACK */) { - strncat(early_cmdline, CONFIG_CMDLINE, - COMMAND_LINE_SIZE - fdt_cmdline_size); + strlcat(early_cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); } return early_cmdline; -- GitLab From 5f449e245e5b0d9d63eef6c8968fbdc3a8594407 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Fri, 22 Dec 2023 06:57:00 -0500 Subject: [PATCH 0805/1290] riscv: mm: Fixup compat mode boot failure In COMPAT mode, the STACK_TOP is DEFAULT_MAP_WINDOW (0x80000000), but the TASK_SIZE is 0x7fff000. When the user stack is upon 0x7fff000, it will cause a user segment fault. Sometimes, it would cause boot failure when the whole rootfs is rv32. Freeing unused kernel image (initmem) memory: 2236K Run /sbin/init as init process Starting init: /sbin/init exists but couldn't execute it (error -14) Run /etc/init as init process ... Increase the TASK_SIZE to cover STACK_TOP. Cc: stable@vger.kernel.org Fixes: add2cc6b6515 ("RISC-V: mm: Restrict address space for sv39,sv48,sv57") Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Leonardo Bras Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20231222115703.2404036-2-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 294044429e8e1..4342e142eea98 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -881,7 +881,7 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) #define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2) #ifdef CONFIG_COMPAT -#define TASK_SIZE_32 (_AC(0x80000000, UL) - PAGE_SIZE) +#define TASK_SIZE_32 (_AC(0x80000000, UL)) #define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \ TASK_SIZE_32 : TASK_SIZE_64) #else -- GitLab From 97b7ac69be2e5a683e898f5267f659fde52efdd5 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Fri, 22 Dec 2023 06:57:01 -0500 Subject: [PATCH 0806/1290] riscv: mm: Fixup compat arch_get_mmap_end When the task is in COMPAT mode, the arch_get_mmap_end should be 2GB, not TASK_SIZE_64. The TASK_SIZE has contained is_compat_mode() detection, so change the definition of STACK_TOP_MAX to TASK_SIZE directly. Cc: stable@vger.kernel.org Fixes: add2cc6b6515 ("RISC-V: mm: Restrict address space for sv39,sv48,sv57") Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Leonardo Bras Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20231222115703.2404036-3-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index f19f861cda549..e1944ff0757a8 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -16,7 +16,7 @@ #ifdef CONFIG_64BIT #define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1)) -#define STACK_TOP_MAX TASK_SIZE_64 +#define STACK_TOP_MAX TASK_SIZE #define arch_get_mmap_end(addr, len, flags) \ ({ \ -- GitLab From f35b88b66fbb5c90298ce3aa483b8a2cf1f39ad0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 10 Jan 2024 20:10:08 -0800 Subject: [PATCH 0807/1290] iommu: Add cache_invalidate_user op The updates of the PTEs in the nested page table will be propagated to the hardware caches. Add a new domain op cache_invalidate_user() for the userspace to flush the hardware caches for a nested domain through iommufd. No wrapper for it, as it's only supposed to be used by iommufd. Then, pass in invalidation requests in form of a user data array containing a number of invalidation data entries. Link: https://lore.kernel.org/r/20240111041015.47920-2-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6291aa7b079b0..93c0d12dd047c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -284,6 +284,23 @@ struct iommu_user_data { size_t len; }; +/** + * struct iommu_user_data_array - iommu driver specific user space data array + * @type: The data type of all the entries in the user buffer array + * @uptr: Pointer to the user buffer array + * @entry_len: The fixed-width length of an entry in the array, in bytes + * @entry_num: The number of total entries in the array + * + * The user buffer includes an array of requests with format defined in + * include/uapi/linux/iommufd.h + */ +struct iommu_user_data_array { + unsigned int type; + void __user *uptr; + size_t entry_len; + u32 entry_num; +}; + /** * __iommu_copy_struct_from_user - Copy iommu driver specific user space data * @dst_data: Pointer to an iommu driver specific user data that is defined in @@ -440,6 +457,13 @@ struct iommu_ops { * @iotlb_sync_map: Sync mappings created recently using @map to the hardware * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue + * @cache_invalidate_user: Flush hardware cache for user space IO page table. + * The @domain must be IOMMU_DOMAIN_NESTED. The @array + * passes in the cache invalidation requests, in form + * of a driver data structure. The driver must update + * array->entry_num to report the number of handled + * invalidation requests. The driver data structure + * must be defined in include/uapi/linux/iommufd.h * @iova_to_phys: translate iova to physical address * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE, * including no-snoop TLPs on PCIe or other platform @@ -465,6 +489,8 @@ struct iommu_domain_ops { size_t size); void (*iotlb_sync)(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather); + int (*cache_invalidate_user)(struct iommu_domain *domain, + struct iommu_user_data_array *array); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); -- GitLab From 8c6eabae3807e048b9f17733af5e20500fbf858c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 10 Jan 2024 20:10:09 -0800 Subject: [PATCH 0808/1290] iommufd: Add IOMMU_HWPT_INVALIDATE In nested translation, the stage-1 page table is user-managed but cached by the IOMMU hardware, so an update on present page table entries in the stage-1 page table should be followed with a cache invalidation. Add an IOMMU_HWPT_INVALIDATE ioctl to support such a cache invalidation. It takes hwpt_id to specify the iommu_domain, and a multi-entry array to support multiple invalidation data in one ioctl. enum iommu_hwpt_invalidate_data_type is defined to tag the data type of the entries in the multi-entry array. Link: https://lore.kernel.org/r/20240111041015.47920-3-yi.l.liu@intel.com Reviewed-by: Kevin Tian Co-developed-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 41 +++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 10 ++++++ drivers/iommu/iommufd/main.c | 3 ++ include/uapi/linux/iommufd.h | 43 +++++++++++++++++++++++++ 4 files changed, 97 insertions(+) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index cbb5df0a6c32f..4e8711f19f721 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -371,3 +371,44 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); return rc; } + +int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_invalidate *cmd = ucmd->cmd; + struct iommu_user_data_array data_array = { + .type = cmd->data_type, + .uptr = u64_to_user_ptr(cmd->data_uptr), + .entry_len = cmd->entry_len, + .entry_num = cmd->entry_num, + }; + struct iommufd_hw_pagetable *hwpt; + u32 done_num = 0; + int rc; + + if (cmd->__reserved) { + rc = -EOPNOTSUPP; + goto out; + } + + if (cmd->entry_num && (!cmd->data_uptr || !cmd->entry_len)) { + rc = -EINVAL; + goto out; + } + + hwpt = iommufd_get_hwpt_nested(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out; + } + + rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, + &data_array); + done_num = data_array.entry_num; + + iommufd_put_object(ucmd->ictx, &hwpt->obj); +out: + cmd->entry_num = done_num; + if (iommufd_ucmd_respond(ucmd, sizeof(*cmd))) + return -EFAULT; + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index abae041e256f7..991f864d1f9bc 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -328,6 +328,15 @@ iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id) IOMMUFD_OBJ_HWPT_PAGING), struct iommufd_hwpt_paging, common.obj); } + +static inline struct iommufd_hw_pagetable * +iommufd_get_hwpt_nested(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_HWPT_NESTED), + struct iommufd_hw_pagetable, obj); +} + int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); @@ -345,6 +354,7 @@ void iommufd_hwpt_paging_abort(struct iommufd_object *obj); void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); void iommufd_hwpt_nested_abort(struct iommufd_object *obj); int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); +int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index c9091e46d208a..39b32932c61ee 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -322,6 +322,7 @@ union ucmd_buffer { struct iommu_hw_info info; struct iommu_hwpt_alloc hwpt; struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; + struct iommu_hwpt_invalidate cache; struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; @@ -360,6 +361,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { __reserved), IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, struct iommu_hwpt_get_dirty_bitmap, data), + IOCTL_OP(IOMMU_HWPT_INVALIDATE, iommufd_hwpt_invalidate, + struct iommu_hwpt_invalidate, __reserved), IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking, struct iommu_hwpt_set_dirty_tracking, __reserved), IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 0b2bc6252e2ca..824560c50ec6d 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -49,6 +49,7 @@ enum { IOMMUFD_CMD_GET_HW_INFO, IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, + IOMMUFD_CMD_HWPT_INVALIDATE, }; /** @@ -613,4 +614,46 @@ struct iommu_hwpt_get_dirty_bitmap { #define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \ IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP) +/** + * enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation + * Data Type + * @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1 + */ +enum iommu_hwpt_invalidate_data_type { + IOMMU_HWPT_INVALIDATE_DATA_VTD_S1, +}; + +/** + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) + * @size: sizeof(struct iommu_hwpt_invalidate) + * @hwpt_id: ID of a nested HWPT for cache invalidation + * @data_uptr: User pointer to an array of driver-specific cache invalidation + * data. + * @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data + * type of all the entries in the invalidation request array. It + * should be a type supported by the hwpt pointed by @hwpt_id. + * @entry_len: Length (in bytes) of a request entry in the request array + * @entry_num: Input the number of cache invalidation requests in the array. + * Output the number of requests successfully handled by kernel. + * @__reserved: Must be 0. + * + * Invalidate the iommu cache for user-managed page table. Modifications on a + * user-managed page table should be followed by this operation to sync cache. + * Each ioctl can support one or more cache invalidation requests in the array + * that has a total size of @entry_len * @entry_num. + * + * An empty invalidation request array by setting @entry_num==0 is allowed, and + * @entry_len and @data_uptr would be ignored in this case. This can be used to + * check if the given @data_type is supported or not by kernel. + */ +struct iommu_hwpt_invalidate { + __u32 size; + __u32 hwpt_id; + __aligned_u64 data_uptr; + __u32 data_type; + __u32 entry_len; + __u32 entry_num; + __u32 __reserved; +}; +#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE) #endif -- GitLab From 77785117f9c73fd71a440a5ac86dd80752967adc Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Jan 2024 20:10:10 -0800 Subject: [PATCH 0809/1290] iommu: Add iommu_copy_struct_from_user_array helper Wrap up the data pointer/num sanity and __iommu_copy_struct_from_user() call for iommu drivers to copy driver specific data at a specific location in the struct iommu_user_data_array. And expect it to be used in cache_invalidate_user ops for example. Link: https://lore.kernel.org/r/20240111041015.47920-4-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 51 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 93c0d12dd047c..9dbadf74a3a1e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -341,6 +341,57 @@ static inline int __iommu_copy_struct_from_user( sizeof(*kdst), \ offsetofend(typeof(*kdst), min_last)) +/** + * __iommu_copy_struct_from_user_array - Copy iommu driver specific user space + * data from an iommu_user_data_array + * @dst_data: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @src_array: Pointer to a struct iommu_user_data_array for a user space array + * @data_type: The data type of the @dst_data. Must match with @src_array.type + * @index: Index to the location in the array to copy user data from + * @data_len: Length of current user data structure, i.e. sizeof(struct _dst) + * @min_len: Initial length of user data structure for backward compatibility. + * This should be offsetofend using the last member in the user data + * struct that was initially added to include/uapi/linux/iommufd.h + */ +static inline int __iommu_copy_struct_from_user_array( + void *dst_data, const struct iommu_user_data_array *src_array, + unsigned int data_type, unsigned int index, size_t data_len, + size_t min_len) +{ + struct iommu_user_data src_data; + + if (WARN_ON(!src_array || index >= src_array->entry_num)) + return -EINVAL; + if (!src_array->entry_num) + return -EINVAL; + src_data.uptr = src_array->uptr + src_array->entry_len * index; + src_data.len = src_array->entry_len; + src_data.type = src_array->type; + + return __iommu_copy_struct_from_user(dst_data, &src_data, data_type, + data_len, min_len); +} + +/** + * iommu_copy_struct_from_user_array - Copy iommu driver specific user space + * data from an iommu_user_data_array + * @kdst: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @user_array: Pointer to a struct iommu_user_data_array for a user space + * array + * @data_type: The data type of the @kdst. Must match with @user_array->type + * @index: Index to the location in the array to copy user data from + * @min_last: The last member of the data structure @kdst points in the + * initial version. + * Return 0 for success, otherwise -error. + */ +#define iommu_copy_struct_from_user_array(kdst, user_array, data_type, index, \ + min_last) \ + __iommu_copy_struct_from_user_array( \ + kdst, user_array, data_type, index, sizeof(*(kdst)), \ + offsetofend(typeof(*(kdst)), min_last)) + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability -- GitLab From ac8691203c07bb1897681d5c66374174336392fe Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Jan 2024 20:10:11 -0800 Subject: [PATCH 0810/1290] iommufd/selftest: Add mock_domain_cache_invalidate_user support Add mock_domain_cache_invalidate_user() data structure to support user space selftest program to cover user cache invalidation pathway. Link: https://lore.kernel.org/r/20240111041015.47920-5-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 18 ++++++++++ drivers/iommu/iommufd/selftest.c | 50 ++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 7910fbe1962d7..09dfc8aa65c4f 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -148,4 +148,22 @@ struct iommu_hwpt_selftest { __u32 iotlb; }; +/* Should not be equal to any defined value in enum iommu_hwpt_invalidate_data_type */ +#define IOMMU_HWPT_INVALIDATE_DATA_SELFTEST 0xdeadbeef +#define IOMMU_HWPT_INVALIDATE_DATA_SELFTEST_INVALID 0xdadbeef + +/** + * struct iommu_hwpt_invalidate_selftest - Invalidation data for Mock driver + * (IOMMU_HWPT_INVALIDATE_DATA_SELFTEST) + * @flags: Invalidate flags + * @iotlb_id: Invalidate iotlb entry index + * + * If IOMMU_TEST_INVALIDATE_ALL is set in @flags, @iotlb_id will be ignored + */ +struct iommu_hwpt_invalidate_selftest { +#define IOMMU_TEST_INVALIDATE_FLAG_ALL (1 << 0) + __u32 flags; + __u32 iotlb_id; +}; + #endif diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 022ef8f55088a..23879135d1c37 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -473,9 +473,59 @@ static void mock_domain_free_nested(struct iommu_domain *domain) kfree(mock_nested); } +static int +mock_domain_cache_invalidate_user(struct iommu_domain *domain, + struct iommu_user_data_array *array) +{ + struct mock_iommu_domain_nested *mock_nested = + container_of(domain, struct mock_iommu_domain_nested, domain); + struct iommu_hwpt_invalidate_selftest inv; + u32 processed = 0; + int i = 0, j; + int rc = 0; + + if (array->type != IOMMU_HWPT_INVALIDATE_DATA_SELFTEST) { + rc = -EINVAL; + goto out; + } + + for ( ; i < array->entry_num; i++) { + rc = iommu_copy_struct_from_user_array(&inv, array, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + i, iotlb_id); + if (rc) + break; + + if (inv.flags & ~IOMMU_TEST_INVALIDATE_FLAG_ALL) { + rc = -EOPNOTSUPP; + break; + } + + if (inv.iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX) { + rc = -EINVAL; + break; + } + + if (inv.flags & IOMMU_TEST_INVALIDATE_FLAG_ALL) { + /* Invalidate all mock iotlb entries and ignore iotlb_id */ + for (j = 0; j < MOCK_NESTED_DOMAIN_IOTLB_NUM; j++) + mock_nested->iotlb[j] = 0; + } else { + mock_nested->iotlb[inv.iotlb_id] = 0; + } + + processed++; + } + +out: + array->entry_num = processed; + return rc; +} + static struct iommu_domain_ops domain_nested_ops = { .free = mock_domain_free_nested, .attach_dev = mock_domain_nop_attach, + .cache_invalidate_user = mock_domain_cache_invalidate_user, }; static inline struct iommufd_hw_pagetable * -- GitLab From e1fa6640d58e3529bd5e392dd92371cde2b31283 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Jan 2024 20:10:12 -0800 Subject: [PATCH 0811/1290] iommufd/selftest: Add IOMMU_TEST_OP_MD_CHECK_IOTLB test op Allow to test whether IOTLB has been invalidated or not. Link: https://lore.kernel.org/r/20240111041015.47920-6-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_test.h | 5 ++++ drivers/iommu/iommufd/selftest.c | 26 +++++++++++++++++++ tools/testing/selftests/iommu/iommufd.c | 4 +++ tools/testing/selftests/iommu/iommufd_utils.h | 24 +++++++++++++++++ 4 files changed, 59 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 09dfc8aa65c4f..482d4059f5db6 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -21,6 +21,7 @@ enum { IOMMU_TEST_OP_ACCESS_REPLACE_IOAS, IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS, IOMMU_TEST_OP_DIRTY, + IOMMU_TEST_OP_MD_CHECK_IOTLB, }; enum { @@ -121,6 +122,10 @@ struct iommu_test_cmd { __aligned_u64 uptr; __aligned_u64 out_nr_dirty; } dirty; + struct { + __u32 id; + __u32 iotlb; + } check_iotlb; }; __u32 last; }; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 23879135d1c37..db648c81507fa 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -843,6 +843,28 @@ static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd, return 0; } +static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, + u32 mockpt_id, unsigned int iotlb_id, + u32 iotlb) +{ + struct mock_iommu_domain_nested *mock_nested; + struct iommufd_hw_pagetable *hwpt; + int rc = 0; + + hwpt = get_md_pagetable_nested(ucmd, mockpt_id, &mock_nested); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + mock_nested = container_of(hwpt->domain, + struct mock_iommu_domain_nested, domain); + + if (iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX || + mock_nested->iotlb[iotlb_id] != iotlb) + rc = -EINVAL; + iommufd_put_object(ucmd->ictx, &hwpt->obj); + return rc; +} + struct selftest_access { struct iommufd_access *access; struct file *file; @@ -1324,6 +1346,10 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return iommufd_test_md_check_refs( ucmd, u64_to_user_ptr(cmd->check_refs.uptr), cmd->check_refs.length, cmd->check_refs.refs); + case IOMMU_TEST_OP_MD_CHECK_IOTLB: + return iommufd_test_md_check_iotlb(ucmd, cmd->id, + cmd->check_iotlb.id, + cmd->check_iotlb.iotlb); case IOMMU_TEST_OP_CREATE_ACCESS: return iommufd_test_create_access(ucmd, cmd->id, cmd->create_access.flags); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 6ed328c863c4f..c8763b880a161 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -330,6 +330,10 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) &nested_hwpt_id[1], IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data)); + test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[0], + IOMMU_TEST_IOTLB_DEFAULT); + test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[1], + IOMMU_TEST_IOTLB_DEFAULT); /* Negative test: a nested hwpt on top of a nested hwpt */ test_err_hwpt_alloc_nested(EINVAL, self->device_id, diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index ad9202335656c..fe0a0f566b678 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -195,6 +195,30 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ hwpt_id, data_type, data, data_len)) +#define test_cmd_hwpt_check_iotlb(hwpt_id, iotlb_id, expected) \ + ({ \ + struct iommu_test_cmd test_cmd = { \ + .size = sizeof(test_cmd), \ + .op = IOMMU_TEST_OP_MD_CHECK_IOTLB, \ + .id = hwpt_id, \ + .check_iotlb = { \ + .id = iotlb_id, \ + .iotlb = expected, \ + }, \ + }; \ + ASSERT_EQ(0, \ + ioctl(self->fd, \ + _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_IOTLB), \ + &test_cmd)); \ + }) + +#define test_cmd_hwpt_check_iotlb_all(hwpt_id, expected) \ + ({ \ + int i; \ + for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++) \ + test_cmd_hwpt_check_iotlb(hwpt_id, i, expected); \ + }) + static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, unsigned int ioas_id) { -- GitLab From bf26eb83fd3b6f392cf68285c8858db5bfd5e570 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Jan 2024 20:10:13 -0800 Subject: [PATCH 0812/1290] iommufd/selftest: Add coverage for IOMMU_HWPT_INVALIDATE ioctl Add test cases for the IOMMU_HWPT_INVALIDATE ioctl and verify it by using the new IOMMU_TEST_OP_MD_CHECK_IOTLB. Link: https://lore.kernel.org/r/20240111041015.47920-7-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- tools/testing/selftests/iommu/iommufd.c | 148 ++++++++++++++++++ tools/testing/selftests/iommu/iommufd_utils.h | 31 ++++ 2 files changed, 179 insertions(+) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index c8763b880a161..1a881e7a21d1b 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -116,6 +116,7 @@ TEST_F(iommufd, cmd_length) TEST_LENGTH(iommu_destroy, IOMMU_DESTROY, id); TEST_LENGTH(iommu_hw_info, IOMMU_GET_HW_INFO, __reserved); TEST_LENGTH(iommu_hwpt_alloc, IOMMU_HWPT_ALLOC, __reserved); + TEST_LENGTH(iommu_hwpt_invalidate, IOMMU_HWPT_INVALIDATE, __reserved); TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC, out_ioas_id); TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES, out_iova_alignment); @@ -271,7 +272,9 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) struct iommu_hwpt_selftest data = { .iotlb = IOMMU_TEST_IOTLB_DEFAULT, }; + struct iommu_hwpt_invalidate_selftest inv_reqs[2] = {}; uint32_t nested_hwpt_id[2] = {}; + uint32_t num_inv; uint32_t parent_hwpt_id = 0; uint32_t parent_hwpt_id_not_work = 0; uint32_t test_hwpt_id = 0; @@ -344,6 +347,151 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, parent_hwpt_id)); + /* hwpt_invalidate only supports a user-managed hwpt (nested) */ + num_inv = 1; + test_err_hwpt_invalidate(ENOENT, parent_hwpt_id, inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Check data_type by passing zero-length array */ + num_inv = 0; + test_cmd_hwpt_invalidate(nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: Invalid data_type */ + num_inv = 1; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST_INVALID, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: structure size sanity */ + num_inv = 1; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs) + 1, &num_inv); + assert(!num_inv); + + num_inv = 1; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + 1, &num_inv); + assert(!num_inv); + + /* Negative test: invalid flag is passed */ + num_inv = 1; + inv_reqs[0].flags = 0xffffffff; + test_err_hwpt_invalidate(EOPNOTSUPP, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: invalid data_uptr when array is not empty */ + num_inv = 1; + inv_reqs[0].flags = 0; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], NULL, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: invalid entry_len when array is not empty */ + num_inv = 1; + inv_reqs[0].flags = 0; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + 0, &num_inv); + assert(!num_inv); + + /* Negative test: invalid iotlb_id */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].iotlb_id = MOCK_NESTED_DOMAIN_IOTLB_ID_MAX + 1; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* + * Invalidate the 1st iotlb entry but fail the 2nd request + * due to invalid flags configuration in the 2nd request. + */ + num_inv = 2; + inv_reqs[0].flags = 0; + inv_reqs[0].iotlb_id = 0; + inv_reqs[1].flags = 0xffffffff; + inv_reqs[1].iotlb_id = 1; + test_err_hwpt_invalidate(EOPNOTSUPP, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 0, 0); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 1, + IOMMU_TEST_IOTLB_DEFAULT); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 2, + IOMMU_TEST_IOTLB_DEFAULT); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 3, + IOMMU_TEST_IOTLB_DEFAULT); + + /* + * Invalidate the 1st iotlb entry but fail the 2nd request + * due to invalid iotlb_id configuration in the 2nd request. + */ + num_inv = 2; + inv_reqs[0].flags = 0; + inv_reqs[0].iotlb_id = 0; + inv_reqs[1].flags = 0; + inv_reqs[1].iotlb_id = MOCK_NESTED_DOMAIN_IOTLB_ID_MAX + 1; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 0, 0); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 1, + IOMMU_TEST_IOTLB_DEFAULT); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 2, + IOMMU_TEST_IOTLB_DEFAULT); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 3, + IOMMU_TEST_IOTLB_DEFAULT); + + /* Invalidate the 2nd iotlb entry and verify */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].iotlb_id = 1; + test_cmd_hwpt_invalidate(nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 0, 0); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 1, 0); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 2, + IOMMU_TEST_IOTLB_DEFAULT); + test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 3, + IOMMU_TEST_IOTLB_DEFAULT); + + /* Invalidate the 3rd and 4th iotlb entries and verify */ + num_inv = 2; + inv_reqs[0].flags = 0; + inv_reqs[0].iotlb_id = 2; + inv_reqs[1].flags = 0; + inv_reqs[1].iotlb_id = 3; + test_cmd_hwpt_invalidate(nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 2); + test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[0], 0); + + /* Invalidate all iotlb entries for nested_hwpt_id[1] and verify */ + num_inv = 1; + inv_reqs[0].flags = IOMMU_TEST_INVALIDATE_FLAG_ALL; + test_cmd_hwpt_invalidate(nested_hwpt_id[1], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[1], 0); + /* Attach device to nested_hwpt_id[0] that then will be busy */ test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id[0]); EXPECT_ERRNO(EBUSY, diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index fe0a0f566b678..c646264aa41fd 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -219,6 +219,37 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, test_cmd_hwpt_check_iotlb(hwpt_id, i, expected); \ }) +static int _test_cmd_hwpt_invalidate(int fd, __u32 hwpt_id, void *reqs, + uint32_t data_type, uint32_t lreq, + uint32_t *nreqs) +{ + struct iommu_hwpt_invalidate cmd = { + .size = sizeof(cmd), + .hwpt_id = hwpt_id, + .data_type = data_type, + .data_uptr = (uint64_t)reqs, + .entry_len = lreq, + .entry_num = *nreqs, + }; + int rc = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cmd); + *nreqs = cmd.entry_num; + return rc; +} + +#define test_cmd_hwpt_invalidate(hwpt_id, reqs, data_type, lreq, nreqs) \ + ({ \ + ASSERT_EQ(0, \ + _test_cmd_hwpt_invalidate(self->fd, hwpt_id, reqs, \ + data_type, lreq, nreqs)); \ + }) +#define test_err_hwpt_invalidate(_errno, hwpt_id, reqs, data_type, lreq, \ + nreqs) \ + ({ \ + EXPECT_ERRNO(_errno, _test_cmd_hwpt_invalidate( \ + self->fd, hwpt_id, reqs, \ + data_type, lreq, nreqs)); \ + }) + static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, unsigned int ioas_id) { -- GitLab From 393a5778b72a7b551493d0fd3fbe0282154058fe Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 10 Jan 2024 20:10:14 -0800 Subject: [PATCH 0813/1290] iommufd: Add data structure for Intel VT-d stage-1 cache invalidation This adds the data structure invalidating caches for the nested domain allocated with IOMMU_HWPT_DATA_VTD_S1 type. Link: https://lore.kernel.org/r/20240111041015.47920-8-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 824560c50ec6d..1dfeaa2e649ee 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -623,6 +623,42 @@ enum iommu_hwpt_invalidate_data_type { IOMMU_HWPT_INVALIDATE_DATA_VTD_S1, }; +/** + * enum iommu_hwpt_vtd_s1_invalidate_flags - Flags for Intel VT-d + * stage-1 cache invalidation + * @IOMMU_VTD_INV_FLAGS_LEAF: Indicates whether the invalidation applies + * to all-levels page structure cache or just + * the leaf PTE cache. + */ +enum iommu_hwpt_vtd_s1_invalidate_flags { + IOMMU_VTD_INV_FLAGS_LEAF = 1 << 0, +}; + +/** + * struct iommu_hwpt_vtd_s1_invalidate - Intel VT-d cache invalidation + * (IOMMU_HWPT_INVALIDATE_DATA_VTD_S1) + * @addr: The start address of the range to be invalidated. It needs to + * be 4KB aligned. + * @npages: Number of contiguous 4K pages to be invalidated. + * @flags: Combination of enum iommu_hwpt_vtd_s1_invalidate_flags + * @__reserved: Must be 0 + * + * The Intel VT-d specific invalidation data for user-managed stage-1 cache + * invalidation in nested translation. Userspace uses this structure to + * tell the impacted cache scope after modifying the stage-1 page table. + * + * Invalidating all the caches related to the page table by setting @addr + * to be 0 and @npages to be U64_MAX. + * + * The device TLB will be invalidated automatically if ATS is enabled. + */ +struct iommu_hwpt_vtd_s1_invalidate { + __aligned_u64 addr; + __aligned_u64 npages; + __u32 flags; + __u32 __reserved; +}; + /** * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) * @size: sizeof(struct iommu_hwpt_invalidate) -- GitLab From 77ed045e88e5cba6d8dec3c5c7cd52105c46b50b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 8 Dec 2023 13:50:35 +0100 Subject: [PATCH 0814/1290] s390/compat: change default for CONFIG_COMPAT to "n" 31 bit support has been removed from the kernel more than eight years ago. The last 31 bit distribution is many years older. There shouldn't be any 31 bit code around anymore. Therefore avoid providing an unused and only partially tested user space interface and change the default for CONFIG_COMPAT from "yes" to "no". Acked-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/Kconfig | 6 ++++-- arch/s390/configs/zfcpdump_defconfig | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8f39f04247966..3408b48423ad6 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -443,7 +443,7 @@ config COMMAND_LINE_SIZE line. config COMPAT - def_bool y + def_bool n prompt "Kernel support for 31 bit emulation" select ARCH_WANT_OLD_COMPAT_IPC select COMPAT_OLD_SIGACTION @@ -454,7 +454,9 @@ config COMPAT Select this option if you want to enable your system kernel to handle system-calls from ELF binaries for 31 bit ESA. This option (and some other stuff like libraries and such) is needed for - executing 31 bit applications. It is safe to say "Y". + executing 31 bit applications. + + If unsure say N. config SMP def_bool y diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 47028450eee15..30d2a16876650 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -10,7 +10,6 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_CRASH_DUMP=y CONFIG_MARCH_Z13=y -# CONFIG_COMPAT is not set CONFIG_NR_CPUS=2 CONFIG_HZ_100=y # CONFIG_CHSC_SCH is not set -- GitLab From 0130a0d3a61889060e912a533e996201c4d9a2e5 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 12 Dec 2023 13:59:42 +0100 Subject: [PATCH 0815/1290] s390/kexec: do not automatically select KEXEC option Following commit dccf78d39f10 ("kernel/Kconfig.kexec: drop select of KEXEC for CRASH_DUMP") also drop automatic KEXEC selection for s390 while set CONFIG_KEXEC=y explicitly for defconfig and debug_defconfig targets. zfcpdump_defconfig target gets CONFIG_KEXEC unset as result, which is right and consistent with CONFIG_KEXEC_FILE besides. Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/Kconfig | 1 - arch/s390/configs/debug_defconfig | 1 + arch/s390/configs/defconfig | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 3408b48423ad6..fe565f3a3a917 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -216,7 +216,6 @@ config S390 select HAVE_VIRT_CPU_ACCOUNTING_IDLE select IOMMU_HELPER if PCI select IOMMU_SUPPORT if PCI - select KEXEC select MMU_GATHER_MERGE_VMAS select MMU_GATHER_NO_GATHER select MMU_GATHER_RCU_TABLE_FREE diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 6de44ede4e14d..689970baf7ac6 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -40,6 +40,7 @@ CONFIG_SCHED_AUTOGROUP=y CONFIG_EXPERT=y # CONFIG_SYSFS_SYSCALL is not set CONFIG_PROFILING=y +CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y CONFIG_KEXEC_SIG=y CONFIG_CRASH_DUMP=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index bcae47da6b7cd..229be2965e6a8 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -38,6 +38,7 @@ CONFIG_SCHED_AUTOGROUP=y CONFIG_EXPERT=y # CONFIG_SYSFS_SYSCALL is not set CONFIG_PROFILING=y +CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y CONFIG_KEXEC_SIG=y CONFIG_CRASH_DUMP=y -- GitLab From 80df7d6af7f6d229b34cf237b2cc9024c07111cd Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 28 Nov 2023 16:22:49 +0100 Subject: [PATCH 0816/1290] s390/pci: fix max size calculation in zpci_memcpy_toio() The zpci_get_max_write_size() helper is used to determine the maximum size a PCI store or load can use at a given __iomem address. For the PCI block store the following restrictions apply: 1. The dst + len must not cross a 4K boundary in the (pseudo-)MMIO space 2. len must not exceed ZPCI_MAX_WRITE_SIZE 3. len must be a multiple of 8 bytes 4. The src address must be double word (8 byte) aligned 5. The dst address must be double word (8 byte) aligned Otherwise only a normal PCI store which takes its src value from a register can be used. For these PCI store restriction 1 still applies. Similarly 1 also applies to PCI loads. It turns out zpci_max_write_size() instead implements stricter conditions which prevents PCI block stores from being used where they can and should be used. In particular instead of conditions 4 and 5 it wrongly enforces both dst and src to be size aligned. This indirectly covers condition 1 but also prevents many legal PCI block stores. On top of the functional shortcomings the zpci_get_max_write_size() is misnamed as it is used for both read and write size calculations. Rename it to zpci_get_max_io_size() and implement the listed conditions explicitly. Reviewed-by: Matthew Rosato Fixes: cd24834130ac ("s390/pci: base support") Signed-off-by: Niklas Schnelle [agordeev@linux.ibm.com replaced spaces with tabs] Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/pci_io.h | 32 ++++++++++++++++++-------------- arch/s390/pci/pci_mmio.c | 12 ++++++------ 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h index 287bb88f76986..2686bee800e3d 100644 --- a/arch/s390/include/asm/pci_io.h +++ b/arch/s390/include/asm/pci_io.h @@ -11,6 +11,8 @@ /* I/O size constraints */ #define ZPCI_MAX_READ_SIZE 8 #define ZPCI_MAX_WRITE_SIZE 128 +#define ZPCI_BOUNDARY_SIZE (1 << 12) +#define ZPCI_BOUNDARY_MASK (ZPCI_BOUNDARY_SIZE - 1) /* I/O Map */ #define ZPCI_IOMAP_SHIFT 48 @@ -125,16 +127,18 @@ out: int zpci_write_block(volatile void __iomem *dst, const void *src, unsigned long len); -static inline u8 zpci_get_max_write_size(u64 src, u64 dst, int len, int max) +static inline int zpci_get_max_io_size(u64 src, u64 dst, int len, int max) { - int count = len > max ? max : len, size = 1; + int offset = dst & ZPCI_BOUNDARY_MASK; + int size; - while (!(src & 0x1) && !(dst & 0x1) && ((size << 1) <= count)) { - dst = dst >> 1; - src = src >> 1; - size = size << 1; - } - return size; + size = min3(len, ZPCI_BOUNDARY_SIZE - offset, max); + if (IS_ALIGNED(src, 8) && IS_ALIGNED(dst, 8) && IS_ALIGNED(size, 8)) + return size; + + if (size >= 8) + return 8; + return rounddown_pow_of_two(size); } static inline int zpci_memcpy_fromio(void *dst, @@ -144,9 +148,9 @@ static inline int zpci_memcpy_fromio(void *dst, int size, rc = 0; while (n > 0) { - size = zpci_get_max_write_size((u64 __force) src, - (u64) dst, n, - ZPCI_MAX_READ_SIZE); + size = zpci_get_max_io_size((u64 __force) src, + (u64) dst, n, + ZPCI_MAX_READ_SIZE); rc = zpci_read_single(dst, src, size); if (rc) break; @@ -166,9 +170,9 @@ static inline int zpci_memcpy_toio(volatile void __iomem *dst, return -EINVAL; while (n > 0) { - size = zpci_get_max_write_size((u64 __force) dst, - (u64) src, n, - ZPCI_MAX_WRITE_SIZE); + size = zpci_get_max_io_size((u64 __force) dst, + (u64) src, n, + ZPCI_MAX_WRITE_SIZE); if (size > 8) /* main path */ rc = zpci_write_block(dst, src, size); else diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 5880893329310..a90499c087f0c 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -97,9 +97,9 @@ static inline int __memcpy_toio_inuser(void __iomem *dst, return -EINVAL; while (n > 0) { - size = zpci_get_max_write_size((u64 __force) dst, - (u64 __force) src, n, - ZPCI_MAX_WRITE_SIZE); + size = zpci_get_max_io_size((u64 __force) dst, + (u64 __force) src, n, + ZPCI_MAX_WRITE_SIZE); if (size > 8) /* main path */ rc = __pcistb_mio_inuser(dst, src, size, &status); else @@ -242,9 +242,9 @@ static inline int __memcpy_fromio_inuser(void __user *dst, u8 status; while (n > 0) { - size = zpci_get_max_write_size((u64 __force) src, - (u64 __force) dst, n, - ZPCI_MAX_READ_SIZE); + size = zpci_get_max_io_size((u64 __force) src, + (u64 __force) dst, n, + ZPCI_MAX_READ_SIZE); rc = __pcilg_mio_inuser(dst, src, size, &status); if (rc) break; -- GitLab From d124e484691a737e174474d230dc6a4b385ba3db Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 3 Jan 2024 13:15:13 +0100 Subject: [PATCH 0817/1290] s390/mm,fault: remove not needed tsk variable tsk is only used as an intermediate variable for current. Remove tsk and use current directly instead at the only place where it is used. Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/mm/fault.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index ab4098886e562..ac4c78546d973 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -280,7 +280,6 @@ static void do_sigbus(struct pt_regs *regs) static void do_exception(struct pt_regs *regs, int access) { struct vm_area_struct *vma; - struct task_struct *tsk; unsigned long address; struct mm_struct *mm; enum fault_type type; @@ -289,7 +288,6 @@ static void do_exception(struct pt_regs *regs, int access) vm_fault_t fault; bool is_write; - tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. @@ -297,7 +295,7 @@ static void do_exception(struct pt_regs *regs, int access) clear_thread_flag(TIF_PER_TRAP); if (kprobe_page_fault(regs, 14)) return; - mm = tsk->mm; + mm = current->mm; address = get_fault_address(regs); is_write = fault_is_write(regs); type = get_fault_type(regs); -- GitLab From 74ca896113531c4a3bba278fd7991da743b7e63b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 5 Jan 2024 15:28:47 +0100 Subject: [PATCH 0818/1290] s390/fpu: remove __load_fpu_regs() export __load_fpu_regs() is only called from core kernel code. Therefore remove the not needed export. Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/kernel/fpu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c index 9e7c15fccfea9..a4f3449cc8141 100644 --- a/arch/s390/kernel/fpu.c +++ b/arch/s390/kernel/fpu.c @@ -208,7 +208,6 @@ void __load_fpu_regs(void) } clear_cpu_flag(CIF_FPU); } -EXPORT_SYMBOL(__load_fpu_regs); void load_fpu_regs(void) { -- GitLab From ba69655fffdb8ada391494cfb8ddfe6e96784b79 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 Jan 2024 11:59:47 +0100 Subject: [PATCH 0819/1290] s390/ptrace: remove leftover comment The code which validates floating point control register contents was reworked with commit 702644249d3e ("s390/fpu: get rid of test_fp_ctl()"). There is still a comment which refers to the old implementation - remove it in order to avoid confusion. Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/kernel/ptrace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 2e6754b62b209..f1897a8bb2210 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -917,7 +917,6 @@ static int s390_fpregs_set(struct task_struct *target, else memcpy(&fprs, target->thread.fpu.fprs, sizeof(fprs)); - /* If setting FPC, must validate it first. */ if (count > 0 && pos < offsetof(s390_fp_regs, fprs)) { u32 ufpc[2] = { target->thread.fpu.fpc, 0 }; rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ufpc, -- GitLab From f6f3721244a8224fa97952b34fbb21e4ee40e8a8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 10 Jan 2024 20:10:15 -0800 Subject: [PATCH 0820/1290] iommu/vt-d: Add iotlb flush for nested domain This implements the .cache_invalidate_user() callback to support iotlb flush for nested domain. Link: https://lore.kernel.org/r/20240111041015.47920-9-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Lu Baolu Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/nested.c | 88 ++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index b5a5563ab32c6..f26c7f1c46cca 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -73,9 +73,97 @@ static void intel_nested_domain_free(struct iommu_domain *domain) kfree(to_dmar_domain(domain)); } +static void nested_flush_dev_iotlb(struct dmar_domain *domain, u64 addr, + unsigned int mask) +{ + struct device_domain_info *info; + unsigned long flags; + u16 sid, qdep; + + spin_lock_irqsave(&domain->lock, flags); + list_for_each_entry(info, &domain->devices, link) { + if (!info->ats_enabled) + continue; + sid = info->bus << 8 | info->devfn; + qdep = info->ats_qdep; + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, + qdep, addr, mask); + quirk_extra_dev_tlb_flush(info, addr, mask, + IOMMU_NO_PASID, qdep); + } + spin_unlock_irqrestore(&domain->lock, flags); +} + +static void intel_nested_flush_cache(struct dmar_domain *domain, u64 addr, + unsigned long npages, bool ih) +{ + struct iommu_domain_info *info; + unsigned int mask; + unsigned long i; + + xa_for_each(&domain->iommu_array, i, info) + qi_flush_piotlb(info->iommu, + domain_id_iommu(domain, info->iommu), + IOMMU_NO_PASID, addr, npages, ih); + + if (!domain->has_iotlb_device) + return; + + if (npages == U64_MAX) + mask = 64 - VTD_PAGE_SHIFT; + else + mask = ilog2(__roundup_pow_of_two(npages)); + + nested_flush_dev_iotlb(domain, addr, mask); +} + +static int intel_nested_cache_invalidate_user(struct iommu_domain *domain, + struct iommu_user_data_array *array) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct iommu_hwpt_vtd_s1_invalidate inv_entry; + u32 index, processed = 0; + int ret = 0; + + if (array->type != IOMMU_HWPT_INVALIDATE_DATA_VTD_S1) { + ret = -EINVAL; + goto out; + } + + for (index = 0; index < array->entry_num; index++) { + ret = iommu_copy_struct_from_user_array(&inv_entry, array, + IOMMU_HWPT_INVALIDATE_DATA_VTD_S1, + index, __reserved); + if (ret) + break; + + if ((inv_entry.flags & ~IOMMU_VTD_INV_FLAGS_LEAF) || + inv_entry.__reserved) { + ret = -EOPNOTSUPP; + break; + } + + if (!IS_ALIGNED(inv_entry.addr, VTD_PAGE_SIZE) || + ((inv_entry.npages == U64_MAX) && inv_entry.addr)) { + ret = -EINVAL; + break; + } + + intel_nested_flush_cache(dmar_domain, inv_entry.addr, + inv_entry.npages, + inv_entry.flags & IOMMU_VTD_INV_FLAGS_LEAF); + processed++; + } + +out: + array->entry_num = processed; + return ret; +} + static const struct iommu_domain_ops intel_nested_domain_ops = { .attach_dev = intel_nested_attach_dev, .free = intel_nested_domain_free, + .cache_invalidate_user = intel_nested_cache_invalidate_user, }; struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, -- GitLab From 47f2bd2ff382e5fe766b1322e354558a8da4a470 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 3 Jan 2024 11:27:22 -0400 Subject: [PATCH 0821/1290] iommufd/selftest: Check the bus type during probe This relied on the probe function only being invoked by the bus type mock was registered on. The removal of the bus ops broke this assumption and the probe could be called on non-mock bus types like PCI. Check the bus type directly in probe. Fixes: 17de3f5fdd35 ("iommu: Retire bus ops") Link: https://lore.kernel.org/r/0-v1-82d59f7eab8c+40c-iommufd_mock_bus_jgg@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/selftest.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index db648c81507fa..d9e9920c7eba4 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -25,6 +25,19 @@ static struct iommu_domain_ops domain_nested_ops; size_t iommufd_test_memory_limit = 65536; +struct mock_bus_type { + struct bus_type bus; + struct notifier_block nb; +}; + +static struct mock_bus_type iommufd_mock_bus_type = { + .bus = { + .name = "iommufd_mock", + }, +}; + +static atomic_t mock_dev_num; + enum { MOCK_DIRTY_TRACK = 1, MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, @@ -437,6 +450,8 @@ static struct iommu_device mock_iommu_device = { static struct iommu_device *mock_probe_device(struct device *dev) { + if (dev->bus != &iommufd_mock_bus_type.bus) + return ERR_PTR(-ENODEV); return &mock_iommu_device; } @@ -576,19 +591,6 @@ get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, return hwpt; } -struct mock_bus_type { - struct bus_type bus; - struct notifier_block nb; -}; - -static struct mock_bus_type iommufd_mock_bus_type = { - .bus = { - .name = "iommufd_mock", - }, -}; - -static atomic_t mock_dev_num; - static void mock_dev_release(struct device *dev) { struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); -- GitLab From 3f302388d45855c0b24802e7b414e3fb29f172e3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Jan 2024 13:34:33 -0700 Subject: [PATCH 0822/1290] io_uring/rsrc: improve code generation for fixed file assignment For the normal read/write path, we have already locked the ring submission side when assigning the file. This causes branch mispredictions when we then check and try and lock again in io_req_set_rsrc_node(). As this is a very hot path, this matters. Add a basic helper that already assumes we already have it locked, and use that in io_file_get_fixed(). Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +++-- io_uring/rsrc.h | 14 +++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4afb911fc042b..50c9f04bc193c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2000,9 +2000,10 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, goto out; fd = array_index_nospec(fd, ctx->nr_user_files); slot = io_fixed_file_slot(&ctx->file_table, fd); - file = io_slot_file(slot); + if (!req->rsrc_node) + __io_req_set_rsrc_node(req, ctx); req->flags |= io_slot_flags(slot); - io_req_set_rsrc_node(req, ctx, 0); + file = io_slot_file(slot); out: io_ring_submit_unlock(ctx, issue_flags); return file; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 7238b9cfe33b6..c6f199bbee284 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -102,17 +102,21 @@ static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, node->refs++; } +static inline void __io_req_set_rsrc_node(struct io_kiocb *req, + struct io_ring_ctx *ctx) +{ + lockdep_assert_held(&ctx->uring_lock); + req->rsrc_node = ctx->rsrc_node; + io_charge_rsrc_node(ctx, ctx->rsrc_node); +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned int issue_flags) { if (!req->rsrc_node) { io_ring_submit_lock(ctx, issue_flags); - - lockdep_assert_held(&ctx->uring_lock); - - req->rsrc_node = ctx->rsrc_node; - io_charge_rsrc_node(ctx, ctx->rsrc_node); + __io_req_set_rsrc_node(req, ctx); io_ring_submit_unlock(ctx, issue_flags); } } -- GitLab From e1b1d282d5ccabbf17e5556191af248a35293e16 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 8 Jan 2024 10:16:02 -0800 Subject: [PATCH 0823/1290] net: fill in MODULE_DESCRIPTION()s for SLIP W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Serial Line (SLIP) protocol modules. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240108181610.2697017-3-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/slip/slhc.c | 1 + drivers/net/slip/slip.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/slip/slhc.c b/drivers/net/slip/slhc.c index ba93bab948e09..18df7ca661981 100644 --- a/drivers/net/slip/slhc.c +++ b/drivers/net/slip/slhc.c @@ -752,4 +752,5 @@ EXPORT_SYMBOL(slhc_compress); EXPORT_SYMBOL(slhc_uncompress); EXPORT_SYMBOL(slhc_toss); +MODULE_DESCRIPTION("Compression helpers for SLIP (serial line)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c index e4280e37fec97..0aba3569ccc0d 100644 --- a/drivers/net/slip/slip.c +++ b/drivers/net/slip/slip.c @@ -1437,5 +1437,6 @@ out: } #endif +MODULE_DESCRIPTION("SLIP (serial line) protocol module"); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_SLIP); -- GitLab From 417d8c571cb49fcace83a6b6477da2970802bf26 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 8 Jan 2024 10:16:03 -0800 Subject: [PATCH 0824/1290] net: fill in MODULE_DESCRIPTION()s for HSR W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to High-availability Seamless Redundancy (HSR) driver. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240108181610.2697017-4-leitao@debian.org Signed-off-by: Jakub Kicinski --- net/hsr/hsr_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index b099c31501509..cb83c8feb7465 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -167,4 +167,5 @@ static void __exit hsr_exit(void) module_init(hsr_init); module_exit(hsr_exit); +MODULE_DESCRIPTION("High-availability Seamless Redundancy (HSR) driver"); MODULE_LICENSE("GPL"); -- GitLab From 95c236cc5fc9f09fc4cf58767fa7c01f2425ad6b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 8 Jan 2024 10:16:04 -0800 Subject: [PATCH 0825/1290] net: fill in MODULE_DESCRIPTION()s for NFC W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to all NFC Controller Interface (NCI) modules. Signed-off-by: Breno Leitao Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240108181610.2697017-5-leitao@debian.org Signed-off-by: Jakub Kicinski --- net/nfc/digital_core.c | 1 + net/nfc/nci/core.c | 1 + net/nfc/nci/spi.c | 1 + 3 files changed, 3 insertions(+) diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index d63d2e5dc60c9..dae378f1d52b6 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -858,4 +858,5 @@ void nfc_digital_unregister_device(struct nfc_digital_dev *ddev) } EXPORT_SYMBOL(nfc_digital_unregister_device); +MODULE_DESCRIPTION("NFC Digital protocol stack"); MODULE_LICENSE("GPL"); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 6c9592d051206..97348cedb16b3 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1577,4 +1577,5 @@ static void nci_cmd_work(struct work_struct *work) } } +MODULE_DESCRIPTION("NFC Controller Interface"); MODULE_LICENSE("GPL"); diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c index b68150c971d0b..6a93533c480e6 100644 --- a/net/nfc/nci/spi.c +++ b/net/nfc/nci/spi.c @@ -319,4 +319,5 @@ done: } EXPORT_SYMBOL_GPL(nci_spi_read); +MODULE_DESCRIPTION("NFC Controller Interface (NCI) SPI link layer"); MODULE_LICENSE("GPL"); -- GitLab From d8610e431fe53ee3a1a11a7f25528b03f8a0b1c7 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 8 Jan 2024 10:16:05 -0800 Subject: [PATCH 0826/1290] net: fill in MODULE_DESCRIPTION()s for Sun RPC W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to Sun RPC modules. Signed-off-by: Breno Leitao Reviewed-by: Jeff Layton Link: https://lore.kernel.org/r/20240108181610.2697017-6-leitao@debian.org Signed-off-by: Jakub Kicinski --- net/sunrpc/auth_gss/auth_gss.c | 1 + net/sunrpc/auth_gss/gss_krb5_mech.c | 1 + net/sunrpc/sunrpc_syms.c | 1 + 3 files changed, 3 insertions(+) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 1af71fbb0d805..c7af0220f82f4 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -2280,6 +2280,7 @@ static void __exit exit_rpcsec_gss(void) } MODULE_ALIAS("rpc-auth-6"); +MODULE_DESCRIPTION("Sun RPC Kerberos RPCSEC_GSS client authentication"); MODULE_LICENSE("GPL"); module_param_named(expired_cred_retry_delay, gss_expired_cred_retry_delay, diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index e31cfdf7eadcb..64cff717c3d9b 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -650,6 +650,7 @@ static void __exit cleanup_kerberos_module(void) gss_mech_unregister(&gss_kerberos_mech); } +MODULE_DESCRIPTION("Sun RPC Kerberos 5 module"); MODULE_LICENSE("GPL"); module_init(init_kerberos_module); module_exit(cleanup_kerberos_module); diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 691c0000e9eae..bab6cab294052 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -148,6 +148,7 @@ cleanup_sunrpc(void) #endif rcu_barrier(); /* Wait for completion of call_rcu()'s */ } +MODULE_DESCRIPTION("Sun RPC core"); MODULE_LICENSE("GPL"); fs_initcall(init_sunrpc); /* Ensure we're initialised before nfs */ module_exit(cleanup_sunrpc); -- GitLab From ade98756128a63dbb801b9d3948c6954cc81b473 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 8 Jan 2024 10:16:08 -0800 Subject: [PATCH 0827/1290] net: fill in MODULE_DESCRIPTION()s for ds26522 module W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to Slic Maxim ds26522 card driver. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240108181610.2697017-9-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/wan/slic_ds26522.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wan/slic_ds26522.c b/drivers/net/wan/slic_ds26522.c index 8a51cfcff99e5..cbb99fc5ea9fe 100644 --- a/drivers/net/wan/slic_ds26522.c +++ b/drivers/net/wan/slic_ds26522.c @@ -28,6 +28,7 @@ static struct spi_device *g_spi; +MODULE_DESCRIPTION("Slic Maxim DS26522 driver"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Zhao Qiang"); -- GitLab From c155eca07647bac84cbce690b4257605f5740dda Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 8 Jan 2024 10:16:09 -0800 Subject: [PATCH 0828/1290] net: fill in MODULE_DESCRIPTION()s for s2io W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Neterion 10GbE (s2io) driver. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240108181610.2697017-10-leitao@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/neterion/s2io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index 61d8bfd12d5fd..55408f16fbbc4 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -414,6 +414,7 @@ static const u64 fix_mac[] = { END_SIGN }; +MODULE_DESCRIPTION("Neterion 10GbE driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); -- GitLab From da14d1fed9c1e2f56a58dcd980d1395121c4665f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:11 -0800 Subject: [PATCH 0829/1290] MAINTAINERS: eth: mtk: move John to CREDITS John is still active in other bits of the kernel but not much on the MediaTek ethernet switch side. Our scripts report: Subsystem MEDIATEK ETHERNET DRIVER Changes 81 / 384 (21%) Last activity: 2023-12-21 Felix Fietkau : Author c6d96df9fa2c 2023-05-02 00:00:00 42 Tags c6d96df9fa2c 2023-05-02 00:00:00 48 John Crispin : Sean Wang : Author 880c2d4b2fdf 2019-06-03 00:00:00 5 Tags a5d75538295b 2020-04-07 00:00:00 7 Mark Lee : Author 8d66a8183d0c 2019-11-14 00:00:00 4 Tags 8d66a8183d0c 2019-11-14 00:00:00 4 Lorenzo Bianconi : Author 7cb8cd4daacf 2023-12-21 00:00:00 98 Tags 7cb8cd4daacf 2023-12-21 00:00:00 112 Top reviewers: [18]: horms@kernel.org [15]: leonro@nvidia.com [8]: rmk+kernel@armlinux.org.uk INACTIVE MAINTAINER John Crispin Reviewed-by: Simon Horman Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20240109164517.3063131-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index d61ba9b5117f1..d174c65e10316 100644 --- a/CREDITS +++ b/CREDITS @@ -814,6 +814,10 @@ D: Support for Xircom PGSDB9 (firmware and host driver) S: Bucharest S: Romania +N: John Crispin +E: john@phrozen.org +D: MediaTek MT7623 Gigabit ethernet support + N: Laurence Culhane E: loz@holmes.demon.co.uk D: Wrote the initial alpha SLIP code diff --git a/MAINTAINERS b/MAINTAINERS index c36618d4659ec..f04fe483ecfbf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13509,7 +13509,6 @@ F: drivers/dma/mediatek/ MEDIATEK ETHERNET DRIVER M: Felix Fietkau -M: John Crispin M: Sean Wang M: Mark Lee M: Lorenzo Bianconi -- GitLab From b59d8485fe7fda864e10800085ce1d19c321922a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:12 -0800 Subject: [PATCH 0830/1290] MAINTAINERS: eth: mt7530: move Landen Chao to CREDITS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mt7530 is a pretty active driver and last we have heard from Landen Chao on the list was March. There were total of 4 message from them in the last 2.5 years. I think it's time to move to CREDITS. Subsystem MEDIATEK SWITCH DRIVER Changes 94 / 169 (55%) Last activity: 2023-10-11 Arınç ÜNAL : Author e94b590abfff 2023-08-19 00:00:00 12 Tags e94b590abfff 2023-08-19 00:00:00 16 Daniel Golle : Author 91daa4f62ce8 2023-04-19 00:00:00 17 Tags ac49b992578d 2023-10-11 00:00:00 20 Landen Chao : DENG Qingfang : Author 342afce10d6f 2021-10-18 00:00:00 24 Tags 342afce10d6f 2021-10-18 00:00:00 25 Sean Wang : Tags c288575f7810 2020-09-14 00:00:00 5 Top reviewers: [46]: f.fainelli@gmail.com [29]: andrew@lunn.ch [19]: olteanv@gmail.com INACTIVE MAINTAINER Landen Chao Acked-by: Arınç ÜNAL Link: https://lore.kernel.org/r/20240109164517.3063131-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index d174c65e10316..76bc0091f9cd4 100644 --- a/CREDITS +++ b/CREDITS @@ -677,6 +677,10 @@ D: Media subsystem (V4L/DVB) drivers and core D: EDAC drivers and EDAC 3.0 core rework S: Brazil +N: Landen Chao +E: Landen.Chao@mediatek.com +D: MT7531 Ethernet switch support + N: Raymond Chen E: raymondc@microsoft.com D: Author of Configure script diff --git a/MAINTAINERS b/MAINTAINERS index f04fe483ecfbf..b29c132f4754d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13664,7 +13664,6 @@ F: include/soc/mediatek/smi.h MEDIATEK SWITCH DRIVER M: Arınç ÜNAL M: Daniel Golle -M: Landen Chao M: DENG Qingfang M: Sean Wang L: netdev@vger.kernel.org -- GitLab From 009a98bca6347d0bd9cf0c31691331e68f0ee380 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:13 -0800 Subject: [PATCH 0831/1290] MAINTAINERS: eth: mvneta: move Thomas to CREDITS Thomas is still active in other bits of the kernel and beyond but not as much on the Marvell Ethernet devices. Our scripts report: Subsystem MARVELL MVNETA ETHERNET DRIVER Changes 54 / 176 (30%) (No activity) Top reviewers: [12]: hawk@kernel.org [9]: toke@redhat.com [9]: john.fastabend@gmail.com INACTIVE MAINTAINER Thomas Petazzoni Acked-by: Thomas Petazzoni Link: https://lore.kernel.org/r/20240109164517.3063131-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index 76bc0091f9cd4..4d6beeaaca3c4 100644 --- a/CREDITS +++ b/CREDITS @@ -3049,6 +3049,10 @@ S: Demonstratsii 8-382 S: Tula 300000 S: Russia +N: Thomas Petazzoni +E: thomas.petazzoni@bootlin.com +D: Driver for the Marvell Armada 370/XP network unit. + N: Gordon Peters E: GordPeters@smarttech.com D: Isochronous receive for IEEE 1394 driver (OHCI module). diff --git a/MAINTAINERS b/MAINTAINERS index b29c132f4754d..ceb9b669dc6c5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12875,9 +12875,8 @@ S: Maintained F: drivers/thermal/armada_thermal.c MARVELL MVNETA ETHERNET DRIVER -M: Thomas Petazzoni L: netdev@vger.kernel.org -S: Maintained +S: Orphan F: drivers/net/ethernet/marvell/mvneta.* MARVELL MVPP2 ETHERNET DRIVER -- GitLab From 384a35866f3a2b75f0c12e09e1bc486ef96cb5f5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:14 -0800 Subject: [PATCH 0832/1290] MAINTAINERS: eth: mark Cavium liquidio as an Orphan We haven't seen much review activity from the liquidio maintainers for years. Reflect that reality in MAINTAINERS. Our scripts report: Subsystem CAVIUM LIQUIDIO NETWORK DRIVER Changes 30 / 87 (34%) Last activity: 2019-01-28 Derek Chickles : Tags ac93e2fa8550 2019-01-28 00:00:00 1 Satanand Burla : Felix Manlunas : Tags ac93e2fa8550 2019-01-28 00:00:00 1 Top reviewers: [5]: simon.horman@corigine.com [4]: keescook@chromium.org [4]: jiri@nvidia.com INACTIVE MAINTAINER Satanand Burla Acked-by: Derek Chickles Link: https://lore.kernel.org/r/20240109164517.3063131-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ceb9b669dc6c5..1ef18977427ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4697,11 +4697,8 @@ F: drivers/i2c/busses/i2c-octeon* F: drivers/i2c/busses/i2c-thunderx* CAVIUM LIQUIDIO NETWORK DRIVER -M: Derek Chickles -M: Satanand Burla -M: Felix Manlunas L: netdev@vger.kernel.org -S: Supported +S: Orphan W: http://www.marvell.com F: drivers/net/ethernet/cavium/liquidio/ -- GitLab From 0bfcdce867f70a8309fe120eb5a9fff2fc54e8b6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:15 -0800 Subject: [PATCH 0833/1290] MAINTAINERS: Bluetooth: retire Johan (for now?) Johan moved to maintaining the Zephyr Bluetooth stack, and we haven't heard from him on the ML in 3 years (according to lore), and seen any tags in git in 4 years. Trade the MAINTAINER entry for CREDITS, we can revert whenever Johan comes back to Linux hacking :) Subsystem BLUETOOTH SUBSYSTEM Changes 173 / 986 (17%) Last activity: 2023-12-22 Marcel Holtmann : Author 91cb4c19118a 2022-01-27 00:00:00 52 Committer edcb185fa9c4 2022-05-23 00:00:00 446 Tags 000c2fa2c144 2023-04-23 00:00:00 523 Johan Hedberg : Luiz Augusto von Dentz : Author d03376c18592 2023-12-22 00:00:00 241 Committer da9065caa594 2023-12-22 00:00:00 341 Tags da9065caa594 2023-12-22 00:00:00 493 Top reviewers: [33]: alainm@chromium.org [31]: mcchou@chromium.org [27]: abhishekpandit@chromium.org INACTIVE MAINTAINER Johan Hedberg Link: https://lore.kernel.org/r/20240109164517.3063131-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index 4d6beeaaca3c4..3c392f7ebc6ea 100644 --- a/CREDITS +++ b/CREDITS @@ -1542,6 +1542,10 @@ N: Andrew Haylett E: ajh@primag.co.uk D: Selection mechanism +N: Johan Hedberg +E: johan.hedberg@gmail.com +D: Bluetooth subsystem maintainer + N: Andre Hedrick E: andre@linux-ide.org E: andre@linuxdiskcert.org diff --git a/MAINTAINERS b/MAINTAINERS index 1ef18977427ef..80422f6393522 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3609,7 +3609,6 @@ F: drivers/mtd/devices/block2mtd.c BLUETOOTH DRIVERS M: Marcel Holtmann -M: Johan Hedberg M: Luiz Augusto von Dentz L: linux-bluetooth@vger.kernel.org S: Supported -- GitLab From bd93edbfd70cdea6e2313c87c3bae5b05bbb947e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:16 -0800 Subject: [PATCH 0834/1290] MAINTAINERS: mark ax25 as Orphan We haven't heard from Ralf for two years, according to lore. We get a constant stream of "fixes" to ax25 from people using code analysis tools. Nobody is reviewing those, let's reflect this reality in MAINTAINERS. Subsystem AX.25 NETWORK LAYER Changes 9 / 59 (15%) (No activity) Top reviewers: [2]: mkl@pengutronix.de [2]: edumazet@google.com [2]: stefan@datenfreihafen.org INACTIVE MAINTAINER Ralf Baechle Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240109164517.3063131-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 1 + MAINTAINERS | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index 3c392f7ebc6ea..f7888de50220a 100644 --- a/CREDITS +++ b/CREDITS @@ -179,6 +179,7 @@ E: ralf@gnu.org P: 1024/AF7B30C1 CF 97 C2 CC 6D AE A7 FE C8 BA 9C FC 88 DE 32 C3 D: Linux/MIPS port D: Linux/68k hacker +D: AX25 maintainer S: Hauptstrasse 19 S: 79837 St. Blasien S: Germany diff --git a/MAINTAINERS b/MAINTAINERS index 80422f6393522..b736b708b34bb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3376,9 +3376,8 @@ F: Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml F: drivers/iio/adc/hx711.c AX.25 NETWORK LAYER -M: Ralf Baechle L: linux-hams@vger.kernel.org -S: Maintained +S: Orphan W: https://linux-ax25.in-berlin.de F: include/net/ax25.h F: include/uapi/linux/ax25.h -- GitLab From f9678f5825dd852b7201132e36691fefb17d49ce Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2024 08:45:17 -0800 Subject: [PATCH 0835/1290] MAINTAINERS: ibmvnic: drop Dany from reviewers I missed that Dany uses a different email address when tagging patches (drt@linux.ibm.com) and asked him if he's still actively working on ibmvnic. He doesn't really fall under our removal criteria, but he admitted that he already moved on to other projects. Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240109164517.3063131-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b736b708b34bb..4194f414dd2c5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10155,7 +10155,6 @@ IBM Power SRIOV Virtual NIC Device Driver M: Haren Myneni M: Rick Lindsley R: Nick Child -R: Dany Madden R: Thomas Falcon L: netdev@vger.kernel.org S: Supported -- GitLab From b3739fb3a9e6633b233d829ee799323d75162775 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 Jan 2024 17:27:53 +0100 Subject: [PATCH 0836/1290] wangxunx: select CONFIG_PHYLINK where needed The ngbe driver needs phylink: arm-linux-gnueabi-ld: drivers/net/ethernet/wangxun/libwx/wx_ethtool.o: in function `wx_nway_reset': wx_ethtool.c:(.text+0x458): undefined reference to `phylink_ethtool_nway_reset' arm-linux-gnueabi-ld: drivers/net/ethernet/wangxun/ngbe/ngbe_main.o: in function `ngbe_remove': ngbe_main.c:(.text+0x7c): undefined reference to `phylink_destroy' arm-linux-gnueabi-ld: drivers/net/ethernet/wangxun/ngbe/ngbe_main.o: in function `ngbe_open': ngbe_main.c:(.text+0xf90): undefined reference to `phylink_connect_phy' arm-linux-gnueabi-ld: drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.o: in function `ngbe_mdio_init': ngbe_mdio.c:(.text+0x314): undefined reference to `phylink_create' Add the missing Kconfig description for this. Fixes: bc2426d74aa3 ("net: ngbe: convert phylib to phylink") Signed-off-by: Arnd Bergmann Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240111162828.68564-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/Kconfig b/drivers/net/ethernet/wangxun/Kconfig index 23cd610bd3766..85cdbdd44fec7 100644 --- a/drivers/net/ethernet/wangxun/Kconfig +++ b/drivers/net/ethernet/wangxun/Kconfig @@ -26,7 +26,7 @@ config NGBE tristate "Wangxun(R) GbE PCI Express adapters support" depends on PCI select LIBWX - select PHYLIB + select PHYLINK help This driver supports Wangxun(R) GbE PCI Express family of adapters. -- GitLab From e689a876969833cb1317f8752cd064595e7b61e2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 10 Jan 2024 21:34:10 +0000 Subject: [PATCH 0837/1290] selftests/net/tcp-ao: Use LDLIBS instead of LDFLAGS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rules to link selftests are: > $(OUTPUT)/%_ipv4: %.c > $(LINK.c) $^ $(LDLIBS) -o $@ > > $(OUTPUT)/%_ipv6: %.c > $(LINK.c) -DIPV6_TEST $^ $(LDLIBS) -o $@ The intel test robot uses only selftest's Makefile, not the top linux Makefile: > make W=1 O=/tmp/kselftest -C tools/testing/selftests So, $(LINK.c) is determined by environment, rather than by kernel Makefiles. On my machine (as well as other people that ran tcp-ao selftests) GNU/Make implicit definition does use $(LDFLAGS): > [dima@Mindolluin ~]$ make -p -f/dev/null | grep '^LINK.c\>' > make: *** No targets. Stop. > LINK.c = $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) But, according to build robot report, it's not the case for them. While I could just avoid using pre-defined $(LINK.c), it's also used by selftests/lib.mk by default. Anyways, according to GNU/Make documentation [1], I should have used $(LDLIBS) instead of $(LDFLAGS) in the first place, so let's just do it: > LDFLAGS > Extra flags to give to compilers when they are supposed to invoke > the linker, ‘ld’, such as -L. Libraries (-lfoo) should be added > to the LDLIBS variable instead. > LDLIBS > Library flags or names given to compilers when they are supposed > to invoke the linker, ‘ld’. LOADLIBES is a deprecated (but still > supported) alternative to LDLIBS. Non-library linker flags, such > as -L, should go in the LDFLAGS variable. [1]: https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html Fixes: cfbab37b3da0 ("selftests/net: Add TCP-AO library") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401011151.veyYTJzq-lkp@intel.com/ Signed-off-by: Dmitry Safonov Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240110-tcp_ao-selftests-makefile-v1-1-aa07d043f052@arista.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/tcp_ao/Makefile b/tools/testing/selftests/net/tcp_ao/Makefile index 8e60bae67aa9f..522d991e310eb 100644 --- a/tools/testing/selftests/net/tcp_ao/Makefile +++ b/tools/testing/selftests/net/tcp_ao/Makefile @@ -52,5 +52,5 @@ $(OUTPUT)/%_ipv6: %.c $(OUTPUT)/icmps-accept_ipv4: CFLAGS+= -DTEST_ICMPS_ACCEPT $(OUTPUT)/icmps-accept_ipv6: CFLAGS+= -DTEST_ICMPS_ACCEPT -$(OUTPUT)/bench-lookups_ipv4: LDFLAGS+= -lm -$(OUTPUT)/bench-lookups_ipv6: LDFLAGS+= -lm +$(OUTPUT)/bench-lookups_ipv4: LDLIBS+= -lm +$(OUTPUT)/bench-lookups_ipv6: LDLIBS+= -lm -- GitLab From b33fb5b801c6db408b774a68e7c8722796b59ecc Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Wed, 10 Jan 2024 14:14:00 +0800 Subject: [PATCH 0838/1290] net: qualcomm: rmnet: fix global oob in rmnet_policy The variable rmnet_link_ops assign a *bigger* maxtype which leads to a global out-of-bounds read when parsing the netlink attributes. See bug trace below: ================================================================== BUG: KASAN: global-out-of-bounds in validate_nla lib/nlattr.c:386 [inline] BUG: KASAN: global-out-of-bounds in __nla_validate_parse+0x24af/0x2750 lib/nlattr.c:600 Read of size 1 at addr ffffffff92c438d0 by task syz-executor.6/84207 CPU: 0 PID: 84207 Comm: syz-executor.6 Tainted: G N 6.1.0 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x8b/0xb3 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:284 [inline] print_report+0x172/0x475 mm/kasan/report.c:395 kasan_report+0xbb/0x1c0 mm/kasan/report.c:495 validate_nla lib/nlattr.c:386 [inline] __nla_validate_parse+0x24af/0x2750 lib/nlattr.c:600 __nla_parse+0x3e/0x50 lib/nlattr.c:697 nla_parse_nested_deprecated include/net/netlink.h:1248 [inline] __rtnl_newlink+0x50a/0x1880 net/core/rtnetlink.c:3485 rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3594 rtnetlink_rcv_msg+0x43c/0xd70 net/core/rtnetlink.c:6091 netlink_rcv_skb+0x14f/0x410 net/netlink/af_netlink.c:2540 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x54e/0x800 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x930/0xe50 net/netlink/af_netlink.c:1921 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0x154/0x190 net/socket.c:734 ____sys_sendmsg+0x6df/0x840 net/socket.c:2482 ___sys_sendmsg+0x110/0x1b0 net/socket.c:2536 __sys_sendmsg+0xf3/0x1c0 net/socket.c:2565 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fdcf2072359 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 f1 19 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fdcf13e3168 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007fdcf219ff80 RCX: 00007fdcf2072359 RDX: 0000000000000000 RSI: 0000000020000200 RDI: 0000000000000003 RBP: 00007fdcf20bd493 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fffbb8d7bdf R14: 00007fdcf13e3300 R15: 0000000000022000 The buggy address belongs to the variable: rmnet_policy+0x30/0xe0 The buggy address belongs to the physical page: page:0000000065bdeb3c refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x155243 flags: 0x200000000001000(reserved|node=0|zone=2) raw: 0200000000001000 ffffea00055490c8 ffffea00055490c8 0000000000000000 raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffffffff92c43780: f9 f9 f9 f9 00 00 00 02 f9 f9 f9 f9 00 00 00 07 ffffffff92c43800: f9 f9 f9 f9 00 00 00 05 f9 f9 f9 f9 06 f9 f9 f9 >ffffffff92c43880: f9 f9 f9 f9 00 00 00 00 00 00 f9 f9 f9 f9 f9 f9 ^ ffffffff92c43900: 00 00 00 00 00 00 00 00 07 f9 f9 f9 f9 f9 f9 f9 ffffffff92c43980: 00 00 00 07 f9 f9 f9 f9 00 00 00 05 f9 f9 f9 f9 According to the comment of `nla_parse_nested_deprecated`, the maxtype should be len(destination array) - 1. Hence use `IFLA_RMNET_MAX` here. Fixes: 14452ca3b5ce ("net: qualcomm: rmnet: Export mux_id and flags to netlink") Signed-off-by: Lin Ma Reviewed-by: Subash Abhinov Kasiviswanathan Reviewed-by: Simon Horman Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240110061400.3356108-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 39d24e07f3067..5b69b9268c757 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -396,7 +396,7 @@ nla_put_failure: struct rtnl_link_ops rmnet_link_ops __read_mostly = { .kind = "rmnet", - .maxtype = __IFLA_RMNET_MAX, + .maxtype = IFLA_RMNET_MAX, .priv_size = sizeof(struct rmnet_priv), .setup = rmnet_vnd_setup, .validate = rmnet_rtnl_validate, -- GitLab From 844f104790bd69c2e4dbb9ee3eba46fde1fcea7b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 10 Jan 2024 02:33:54 +0200 Subject: [PATCH 0839/1290] net: dsa: fix netdev_priv() dereference before check on non-DSA netdevice events After the blamed commit, we started doing this dereference for every NETDEV_CHANGEUPPER and NETDEV_PRECHANGEUPPER event in the system. static inline struct dsa_port *dsa_user_to_port(const struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); return p->dp; } Which is obviously bogus, because not all net_devices have a netdev_priv() of type struct dsa_user_priv. But struct dsa_user_priv is fairly small, and p->dp means dereferencing 8 bytes starting with offset 16. Most drivers allocate that much private memory anyway, making our access not fault, and we discard the bogus data quickly afterwards, so this wasn't caught. But the dummy interface is somewhat special in that it calls alloc_netdev() with a priv size of 0. So every netdev_priv() dereference is invalid, and we get this when we emit a NETDEV_PRECHANGEUPPER event with a VLAN as its new upper: $ ip link add dummy1 type dummy $ ip link add link dummy1 name dummy1.100 type vlan id 100 [ 43.309174] ================================================================== [ 43.316456] BUG: KASAN: slab-out-of-bounds in dsa_user_prechangeupper+0x30/0xe8 [ 43.323835] Read of size 8 at addr ffff3f86481d2990 by task ip/374 [ 43.330058] [ 43.342436] Call trace: [ 43.366542] dsa_user_prechangeupper+0x30/0xe8 [ 43.371024] dsa_user_netdevice_event+0xb38/0xee8 [ 43.375768] notifier_call_chain+0xa4/0x210 [ 43.379985] raw_notifier_call_chain+0x24/0x38 [ 43.384464] __netdev_upper_dev_link+0x3ec/0x5d8 [ 43.389120] netdev_upper_dev_link+0x70/0xa8 [ 43.393424] register_vlan_dev+0x1bc/0x310 [ 43.397554] vlan_newlink+0x210/0x248 [ 43.401247] rtnl_newlink+0x9fc/0xe30 [ 43.404942] rtnetlink_rcv_msg+0x378/0x580 Avoid the kernel oops by dereferencing after the type check, as customary. Fixes: 4c3f80d22b2e ("net: dsa: walk through all changeupper notifier functions") Reported-and-tested-by: syzbot+d81bcd883824180500c8@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/0000000000001d4255060e87545c@google.com/ Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240110003354.2796778-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/user.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/dsa/user.c b/net/dsa/user.c index b738a466e2dcc..b15e71cc342c7 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -2806,13 +2806,14 @@ EXPORT_SYMBOL_GPL(dsa_user_dev_check); static int dsa_user_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { - struct dsa_port *dp = dsa_user_to_port(dev); struct netlink_ext_ack *extack; int err = NOTIFY_DONE; + struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return err; + dp = dsa_user_to_port(dev); extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_bridge_master(info->upper_dev)) { @@ -2865,11 +2866,13 @@ static int dsa_user_changeupper(struct net_device *dev, static int dsa_user_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { - struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return NOTIFY_DONE; + dp = dsa_user_to_port(dev); + if (netif_is_bridge_master(info->upper_dev) && !info->linking) dsa_port_pre_bridge_leave(dp, info->upper_dev); else if (netif_is_lag_master(info->upper_dev) && !info->linking) -- GitLab From 8722014311e613244f33952354956a82fa4b0472 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 9 Jan 2024 15:10:48 +0000 Subject: [PATCH 0840/1290] rxrpc: Fix use of Don't Fragment flag rxrpc normally has the Don't Fragment flag set on the UDP packets it transmits, except when it has decided that DATA packets aren't getting through - in which case it turns it off just for the DATA transmissions. This can be a problem, however, for RESPONSE packets that convey authentication and crypto data from the client to the server as ticket may be larger than can fit in the MTU. In such a case, rxrpc gets itself into an infinite loop as the sendmsg returns an error (EMSGSIZE), which causes rxkad_send_response() to return -EAGAIN - and the CHALLENGE packet is put back on the Rx queue to retry, leading to the I/O thread endlessly attempting to perform the transmission. Fix this by disabling DF on RESPONSE packets for now. The use of DF and best data MTU determination needs reconsidering at some point in the future. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Reported-by: Marc Dionne Signed-off-by: David Howells cc: linux-afs@lists.infradead.org Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/1581852.1704813048@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/local_object.c | 13 ++++++++++++- net/rxrpc/output.c | 6 ++---- net/rxrpc/rxkad.c | 2 ++ 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 2f8b39a614c31..dbeb75c298573 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1079,6 +1079,7 @@ void rxrpc_send_version_request(struct rxrpc_local *local, /* * local_object.c */ +void rxrpc_local_dont_fragment(const struct rxrpc_local *local, bool set); struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc *); struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *, enum rxrpc_local_trace); struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *, enum rxrpc_local_trace); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index c553a30e9c838..34d3073681353 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -36,6 +36,17 @@ static void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, int err, return ipv6_icmp_error(sk, skb, err, port, info, payload); } +/* + * Set or clear the Don't Fragment flag on a socket. + */ +void rxrpc_local_dont_fragment(const struct rxrpc_local *local, bool set) +{ + if (set) + ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DO); + else + ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DONT); +} + /* * Compare a local to an address. Return -ve, 0 or +ve to indicate less than, * same or greater than. @@ -203,7 +214,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) ip_sock_set_recverr(usk); /* we want to set the don't fragment bit */ - ip_sock_set_mtu_discover(usk, IP_PMTUDISC_DO); + rxrpc_local_dont_fragment(local, true); /* We want receive timestamps. */ sock_enable_timestamps(usk); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5e53429c69228..a0906145e8293 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -494,14 +494,12 @@ send_fragmentable: switch (conn->local->srx.transport.family) { case AF_INET6: case AF_INET: - ip_sock_set_mtu_discover(conn->local->socket->sk, - IP_PMTUDISC_DONT); + rxrpc_local_dont_fragment(conn->local, false); rxrpc_inc_stat(call->rxnet, stat_tx_data_send_frag); ret = do_udp_sendmsg(conn->local->socket, &msg, len); conn->peer->last_tx_at = ktime_get_seconds(); - ip_sock_set_mtu_discover(conn->local->socket->sk, - IP_PMTUDISC_DO); + rxrpc_local_dont_fragment(conn->local, true); break; default: diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 1bf571a66e020..b52dedcebce0a 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -724,7 +724,9 @@ static int rxkad_send_response(struct rxrpc_connection *conn, serial = atomic_inc_return(&conn->serial); whdr.serial = htonl(serial); + rxrpc_local_dont_fragment(conn->local, false); ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len); + rxrpc_local_dont_fragment(conn->local, true); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, rxrpc_tx_point_rxkad_response); -- GitLab From ec4ffd100ffb396eca13ebe7d18938ea80f399c3 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 8 Jan 2024 10:41:02 +0100 Subject: [PATCH 0841/1290] Revert "net: rtnetlink: Enslave device before bringing it up" This reverts commit a4abfa627c3865c37e036bccb681619a50d3d93c. The patch broke: > ip link set dummy0 up > ip link set dummy0 master bond0 down This last command is useful to be able to enslave an interface with only one netlink message. After discussion, there is no good reason to support: > ip link set dummy0 down > ip link set dummy0 master bond0 up because the bond interface already set the slave up when it is up. Cc: stable@vger.kernel.org Fixes: a4abfa627c38 ("net: rtnetlink: Enslave device before bringing it up") Signed-off-by: Nicolas Dichtel Reviewed-by: Jiri Pirko Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20240108094103.2001224-2-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5f6ed6da3cfc0..f6f29eb03ec27 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2899,13 +2899,6 @@ static int do_setlink(const struct sk_buff *skb, call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } - if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); - if (err) - goto errout; - status |= DO_SETLINK_MODIFIED; - } - if (ifm->ifi_flags || ifm->ifi_change) { err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), extack); @@ -2913,6 +2906,13 @@ static int do_setlink(const struct sk_buff *skb, goto errout; } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) -- GitLab From a159cbe81d3b88bf35cd4edb4e6040d015972fdd Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 8 Jan 2024 10:41:03 +0100 Subject: [PATCH 0842/1290] selftests: rtnetlink: check enslaving iface in a bond The goal is to check the following two sequences: > ip link set dummy0 up > ip link set dummy0 master bond0 down Signed-off-by: Nicolas Dichtel Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240108094103.2001224-3-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/rtnetlink.sh | 28 ++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index a10a32952f216..a31be0eaaa50f 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -28,6 +28,7 @@ ALL_TESTS=" kci_test_neigh_get kci_test_bridge_parent_id kci_test_address_proto + kci_test_enslave_bonding " devdummy="test-dummy0" @@ -1241,6 +1242,33 @@ kci_test_address_proto() return $ret } +kci_test_enslave_bonding() +{ + local testns="testns" + local bond="bond123" + local dummy="dummy123" + local ret=0 + + run_cmd ip netns add "$testns" + if [ $ret -ne 0 ]; then + end_test "SKIP bonding tests: cannot add net namespace $testns" + return $ksft_skip + fi + + run_cmd ip -netns $testns link add dev $bond type bond mode balance-rr + run_cmd ip -netns $testns link add dev $dummy type dummy + run_cmd ip -netns $testns link set dev $dummy up + run_cmd ip -netns $testns link set dev $dummy master $bond down + if [ $ret -ne 0 ]; then + end_test "FAIL: initially up interface added to a bond and set down" + ip netns del "$testns" + return 1 + fi + + end_test "PASS: enslave interface in a bond" + ip netns del "$testns" +} + kci_test_rtnl() { local current_test -- GitLab From a0cb76a770083a22167659e64ba69af6425b1d9b Mon Sep 17 00:00:00 2001 From: Nithin Dabilpuram Date: Mon, 8 Jan 2024 13:00:36 +0530 Subject: [PATCH 0843/1290] octeontx2-af: CN10KB: Fix FIFO length calculation for RPM2 RPM0 and RPM1 on the CN10KB SoC have 8 LMACs each, whereas RPM2 has only 4 LMACs. Similarly, the RPM0 and RPM1 have 256KB FIFO, whereas RPM2 has 128KB FIFO. This patch fixes an issue with improper TX credit programming for the RPM2 link. Fixes: b9d0fedc6234 ("octeontx2-af: cn10kb: Add RPM_USX MAC support") Signed-off-by: Nithin Dabilpuram Signed-off-by: Naveen Mamindlapalli Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240108073036.8766-1-naveenm@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/rpm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c index 4728ba34b0e34..76218f1cb4595 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c @@ -506,6 +506,7 @@ u32 rpm2_get_lmac_fifo_len(void *rpmd, int lmac_id) rpm_t *rpm = rpmd; u8 num_lmacs; u32 fifo_len; + u16 max_lmac; lmac_info = rpm_read(rpm, 0, RPM2_CMRX_RX_LMACS); /* LMACs are divided into two groups and each group @@ -513,7 +514,11 @@ u32 rpm2_get_lmac_fifo_len(void *rpmd, int lmac_id) * Group0 lmac_id range {0..3} * Group1 lmac_id range {4..7} */ - fifo_len = rpm->mac_ops->fifo_len / 2; + max_lmac = (rpm_read(rpm, 0, CGX_CONST) >> 24) & 0xFF; + if (max_lmac > 4) + fifo_len = rpm->mac_ops->fifo_len / 2; + else + fifo_len = rpm->mac_ops->fifo_len; if (lmac_id < 4) { num_lmacs = hweight8(lmac_info & 0xF); -- GitLab From e3fe8d28c67bf6c291e920c6d04fa22afa14e6e4 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Thu, 4 Jan 2024 10:09:02 +0800 Subject: [PATCH 0844/1290] =?UTF-8?q?virtio=5Fnet:=20Fix=20"=E2=80=98%d?= =?UTF-8?q?=E2=80=99=20directive=20writing=20between=201=20and=2011=20byte?= =?UTF-8?q?s=20into=20a=20region=20of=20size=2010"=20warnings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the warnings when building virtio_net driver. " drivers/net/virtio_net.c: In function ‘init_vqs’: drivers/net/virtio_net.c:4551:48: warning: ‘%d’ directive writing between 1 and 11 bytes into a region of size 10 [-Wformat-overflow=] 4551 | sprintf(vi->rq[i].name, "input.%d", i); | ^~ In function ‘virtnet_find_vqs’, inlined from ‘init_vqs’ at drivers/net/virtio_net.c:4645:8: drivers/net/virtio_net.c:4551:41: note: directive argument in the range [-2147483643, 65534] 4551 | sprintf(vi->rq[i].name, "input.%d", i); | ^~~~~~~~~~ drivers/net/virtio_net.c:4551:17: note: ‘sprintf’ output between 8 and 18 bytes into a destination of size 16 4551 | sprintf(vi->rq[i].name, "input.%d", i); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/net/virtio_net.c: In function ‘init_vqs’: drivers/net/virtio_net.c:4552:49: warning: ‘%d’ directive writing between 1 and 11 bytes into a region of size 9 [-Wformat-overflow=] 4552 | sprintf(vi->sq[i].name, "output.%d", i); | ^~ In function ‘virtnet_find_vqs’, inlined from ‘init_vqs’ at drivers/net/virtio_net.c:4645:8: drivers/net/virtio_net.c:4552:41: note: directive argument in the range [-2147483643, 65534] 4552 | sprintf(vi->sq[i].name, "output.%d", i); | ^~~~~~~~~~~ drivers/net/virtio_net.c:4552:17: note: ‘sprintf’ output between 9 and 19 bytes into a destination of size 16 4552 | sprintf(vi->sq[i].name, "output.%d", i); " Reviewed-by: Xuan Zhuo Signed-off-by: Zhu Yanjun Link: https://lore.kernel.org/r/20240104020902.2753599-1-yanjun.zhu@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3cb8aa1938841..d7ce4a1011ea2 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -4295,10 +4295,11 @@ static int virtnet_find_vqs(struct virtnet_info *vi) { vq_callback_t **callbacks; struct virtqueue **vqs; - int ret = -ENOMEM; - int i, total_vqs; const char **names; + int ret = -ENOMEM; + int total_vqs; bool *ctx; + u16 i; /* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by @@ -4335,8 +4336,8 @@ static int virtnet_find_vqs(struct virtnet_info *vi) for (i = 0; i < vi->max_queue_pairs; i++) { callbacks[rxq2vq(i)] = skb_recv_done; callbacks[txq2vq(i)] = skb_xmit_done; - sprintf(vi->rq[i].name, "input.%d", i); - sprintf(vi->sq[i].name, "output.%d", i); + sprintf(vi->rq[i].name, "input.%u", i); + sprintf(vi->sq[i].name, "output.%u", i); names[rxq2vq(i)] = vi->rq[i].name; names[txq2vq(i)] = vi->sq[i].name; if (ctx) -- GitLab From 64e47d8afb5ca533b27efc006405e5bcae2c4a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sanju=C3=A1n=20Garc=C3=ADa=2C=20Jorge?= Date: Fri, 5 Jan 2024 08:55:43 +0000 Subject: [PATCH 0845/1290] net: ethernet: ti: am65-cpsw: Fix max mtu to fit ethernet frames The value of AM65_CPSW_MAX_PACKET_SIZE represents the maximum length of a received frame. This value is written to the register AM65_CPSW_PORT_REG_RX_MAXLEN. The maximum MTU configured on the network device should then leave some room for the ethernet headers and frame check. Otherwise, if the network interface is configured to its maximum mtu possible, the frames will be larger than AM65_CPSW_MAX_PACKET_SIZE and will get dropped as oversized. The switch supports ethernet frame sizes between 64 and 2024 bytes (including VLAN) as stated in the technical reference manual, so define AM65_CPSW_MAX_PACKET_SIZE with that maximum size. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Signed-off-by: Jorge Sanjuan Garcia Reviewed-by: Horatiu Vultur Reviewed-by: Siddharth Vadapalli Link: https://lore.kernel.org/r/20240105085530.14070-2-jorge.sanjuangarcia@duagon.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index faa0561e988ec..9d2f4ac783e43 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -56,7 +56,7 @@ #define AM65_CPSW_MAX_PORTS 8 #define AM65_CPSW_MIN_PACKET_SIZE VLAN_ETH_ZLEN -#define AM65_CPSW_MAX_PACKET_SIZE (VLAN_ETH_FRAME_LEN + ETH_FCS_LEN) +#define AM65_CPSW_MAX_PACKET_SIZE 2024 #define AM65_CPSW_REG_CTL 0x004 #define AM65_CPSW_REG_STAT_PORT_EN 0x014 @@ -2244,7 +2244,8 @@ am65_cpsw_nuss_init_port_ndev(struct am65_cpsw_common *common, u32 port_idx) eth_hw_addr_set(port->ndev, port->slave.mac_addr); port->ndev->min_mtu = AM65_CPSW_MIN_PACKET_SIZE; - port->ndev->max_mtu = AM65_CPSW_MAX_PACKET_SIZE; + port->ndev->max_mtu = AM65_CPSW_MAX_PACKET_SIZE - + (VLAN_ETH_HLEN + ETH_FCS_LEN); port->ndev->hw_features = NETIF_F_SG | NETIF_F_RXCSUM | NETIF_F_HW_CSUM | -- GitLab From bec161add35b478a7746bf58bcdea6faa19129ef Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 7 Jan 2024 14:42:41 +0000 Subject: [PATCH 0846/1290] amt: do not use overwrapped cb area amt driver uses skb->cb for storing tunnel information. This job is worked before TC layer and then amt driver load tunnel info from skb->cb after TC layer. So, its cb area should not be overwrapped with CB area used by TC. In order to not use cb area used by TC, it skips the biggest cb structure used by TC, which was qdisc_skb_cb. But it's not anymore. Currently, biggest structure of TC's CB is tc_skb_cb. So, it should skip size of tc_skb_cb instead of qdisc_skb_cb. Fixes: ec624fe740b4 ("net/sched: Extend qdisc control block with tc control block") Signed-off-by: Taehee Yoo Acked-by: Paolo Abeni Reviewed-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240107144241.4169520-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 53415e83821ce..68e79b1272f6b 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include @@ -80,11 +80,11 @@ static struct mld2_grec mldv2_zero_grec; static struct amt_skb_cb *amt_skb_cb(struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(struct amt_skb_cb) + sizeof(struct qdisc_skb_cb) > + BUILD_BUG_ON(sizeof(struct amt_skb_cb) + sizeof(struct tc_skb_cb) > sizeof_field(struct sk_buff, cb)); return (struct amt_skb_cb *)((void *)skb->cb + - sizeof(struct qdisc_skb_cb)); + sizeof(struct tc_skb_cb)); } static void __amt_source_gc_work(void) -- GitLab From b271fee9a41ca1474d30639fd6cc912c9901d0f8 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 19 Dec 2023 01:02:28 +0900 Subject: [PATCH 0847/1290] btrfs: zoned: factor out prepare_allocation_zoned() Factor out prepare_allocation_zoned() for further extension. While at it, optimize the if-branch a bit. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f396aba92c579..d260b970bec77 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4298,6 +4298,24 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, return 0; } +static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } else if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } + + return 0; +} + static int prepare_allocation(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4308,19 +4326,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - if (ffe_ctl->for_treelog) { - spin_lock(&fs_info->treelog_bg_lock); - if (fs_info->treelog_bg) - ffe_ctl->hint_byte = fs_info->treelog_bg; - spin_unlock(&fs_info->treelog_bg_lock); - } - if (ffe_ctl->for_data_reloc) { - spin_lock(&fs_info->relocation_bg_lock); - if (fs_info->data_reloc_bg) - ffe_ctl->hint_byte = fs_info->data_reloc_bg; - spin_unlock(&fs_info->relocation_bg_lock); - } - return 0; + return prepare_allocation_zoned(fs_info, ffe_ctl); default: BUG(); } -- GitLab From 02444f2ac26eae6385a65fcd66915084d15dffba Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 19 Dec 2023 01:02:29 +0900 Subject: [PATCH 0848/1290] btrfs: zoned: optimize hint byte for zoned allocator Writing sequentially to a huge file on btrfs on a SMR HDD revealed a decline of the performance (220 MiB/s to 30 MiB/s after 500 minutes). The performance goes down because of increased latency of the extent allocation, which is induced by a traversing of a lot of full block groups. So, this patch optimizes the ffe_ctl->hint_byte by choosing a block group with sufficient size from the active block group list, which does not contain full block groups. After applying the patch, the performance is maintained well. Fixes: 2eda57089ea3 ("btrfs: zoned: implement sequential extent allocation") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d260b970bec77..6d680031211a1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4311,6 +4311,24 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, if (fs_info->data_reloc_bg) ffe_ctl->hint_byte = fs_info->data_reloc_bg; spin_unlock(&fs_info->relocation_bg_lock); + } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + struct btrfs_block_group *block_group; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + /* + * No lock is OK here because avail is monotinically + * decreasing, and this is just a hint. + */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; + } + } + spin_unlock(&fs_info->zone_active_bgs_lock); } return 0; -- GitLab From 6ff09b6b8c2fb6b3edda4ffaa173153a40653067 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 21 Dec 2023 11:47:45 +0300 Subject: [PATCH 0849/1290] btrfs: fix kvcalloc() arguments order in btrfs_ioctl_send() When compiling with gcc version 14.0.0 20231220 (experimental) and W=1, I've noticed the following warning: fs/btrfs/send.c: In function 'btrfs_ioctl_send': fs/btrfs/send.c:8208:44: warning: 'kvcalloc' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Wcalloc-transposed-args] 8208 | sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), | ^ Since 'n' and 'size' arguments of 'kvcalloc()' are multiplied to calculate the final size, their actual order doesn't affect the result and so this is not a bug. But it's still worth to fix it. Signed-off-by: Dmitry Antipov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4e36550618e58..2d7519a6ce72d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -8205,8 +8205,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), - arg->clone_sources_count + 1, + sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1, + sizeof(*sctx->clone_roots), GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; -- GitLab From f03e274a8b29d1d1c1bbd7f764766cb5ca537ab7 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Wed, 3 Jan 2024 13:31:27 +0300 Subject: [PATCH 0850/1290] btrfs: ref-verify: free ref cache before clearing mount opt As clearing REF_VERIFY mount option indicates there were some errors in a ref-verify process, a ref cache is not relevant anymore and should be freed. btrfs_free_ref_cache() requires REF_VERIFY option being set so call it just before clearing the mount option. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Reported-by: syzbot+be14ed7728594dc8bd42@syzkaller.appspotmail.com Fixes: fd708b81d972 ("Btrfs: add a extent ref verify tool") CC: stable@vger.kernel.org # 5.4+ Closes: https://lore.kernel.org/lkml/000000000000e5a65c05ee832054@google.com/ Reported-by: syzbot+c563a3c79927971f950f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/0000000000007fe09705fdc6086c@google.com/ Reviewed-by: Anand Jain Signed-off-by: Fedor Pchelkin Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 6486f0d7e9931..8c4fc98ca9ce7 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, out_unlock: spin_unlock(&fs_info->ref_verify_lock); out: - if (ret) + if (ret) { + btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + } return ret; } @@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) } } if (ret) { - btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); btrfs_free_ref_cache(fs_info); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } btrfs_free_path(path); return ret; -- GitLab From d967c914a633ee797255261808720f791b658f24 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 22 Dec 2023 01:46:16 +0900 Subject: [PATCH 0851/1290] btrfs: fix unbalanced unlock of mapping_tree_lock The error path of btrfs_get_chunk_map() releases fs_info->mapping_tree_lock. But, it is taken and released in btrfs_find_chunk_map(). So, there is no need to do so. Fixes: 7dc66abb5a47 ("btrfs: use a dedicated data structure for chunk maps") Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c32497311d2f..d67785be2c778 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, map = btrfs_find_chunk_map(fs_info, logical, length); if (unlikely(!map)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); @@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, } if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", logical, logical + length, map->start, -- GitLab From b18f3b60b35a8c01c9a2a0f0d6424c6d73971dc3 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 22 Dec 2023 13:56:34 +0900 Subject: [PATCH 0852/1290] btrfs: zoned: fix lock ordering in btrfs_zone_activate() The btrfs CI reported a lockdep warning as follows by running generic generic/129. WARNING: possible circular locking dependency detected 6.7.0-rc5+ #1 Not tainted ------------------------------------------------------ kworker/u5:5/793427 is trying to acquire lock: ffff88813256d028 (&cache->lock){+.+.}-{2:2}, at: btrfs_zone_finish_one_bg+0x5e/0x130 but task is already holding lock: ffff88810a23a318 (&fs_info->zone_active_bgs_lock){+.+.}-{2:2}, at: btrfs_zone_finish_one_bg+0x34/0x130 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&fs_info->zone_active_bgs_lock){+.+.}-{2:2}: ... -> #0 (&cache->lock){+.+.}-{2:2}: ... This is because we take fs_info->zone_active_bgs_lock after a block_group's lock in btrfs_zone_activate() while doing the opposite in other places. Fix the issue by expanding the fs_info->zone_active_bgs_lock's critical section and taking it before a block_group's lock. Fixes: a7e1ac7bdc5a ("btrfs: zoned: reserve zones for an active metadata/system block group") CC: stable@vger.kernel.org # 6.6 Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 12066afc235c3..ac9bbe0c4ffe6 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2072,6 +2072,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; + spin_lock(&fs_info->zone_active_bgs_lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; @@ -2084,7 +2085,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } - spin_lock(&fs_info->zone_active_bgs_lock); for (i = 0; i < map->num_stripes; i++) { struct btrfs_zoned_device_info *zinfo; int reserved = 0; @@ -2104,20 +2104,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) */ if (atomic_read(&zinfo->active_zones_left) <= reserved) { ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!is_data) zinfo->reserved_active_zones--; } - spin_unlock(&fs_info->zone_active_bgs_lock); /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); @@ -2125,8 +2122,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* For the active block group list */ btrfs_get_block_group(block_group); - - spin_lock(&fs_info->zone_active_bgs_lock); list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); @@ -2134,6 +2129,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&fs_info->zone_active_bgs_lock); return ret; } -- GitLab From 7081929ab2572920e94d70be3d332e5c9f97095a Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 4 Jan 2024 11:48:46 -0800 Subject: [PATCH 0853/1290] btrfs: don't abort filesystem when attempting to snapshot deleted subvolume If the source file descriptor to the snapshot ioctl refers to a deleted subvolume, we get the following abort: BTRFS: Transaction aborted (error -2) WARNING: CPU: 0 PID: 833 at fs/btrfs/transaction.c:1875 create_pending_snapshot+0x1040/0x1190 [btrfs] Modules linked in: pata_acpi btrfs ata_piix libata scsi_mod virtio_net blake2b_generic xor net_failover virtio_rng failover scsi_common rng_core raid6_pq libcrc32c CPU: 0 PID: 833 Comm: t_snapshot_dele Not tainted 6.7.0-rc6 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-1.fc39 04/01/2014 RIP: 0010:create_pending_snapshot+0x1040/0x1190 [btrfs] RSP: 0018:ffffa09c01337af8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff9982053e7c78 RCX: 0000000000000027 RDX: ffff99827dc20848 RSI: 0000000000000001 RDI: ffff99827dc20840 RBP: ffffa09c01337c00 R08: 0000000000000000 R09: ffffa09c01337998 R10: 0000000000000003 R11: ffffffffb96da248 R12: fffffffffffffffe R13: ffff99820535bb28 R14: ffff99820b7bd000 R15: ffff99820381ea80 FS: 00007fe20aadabc0(0000) GS:ffff99827dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559a120b502f CR3: 00000000055b6000 CR4: 00000000000006f0 Call Trace: ? create_pending_snapshot+0x1040/0x1190 [btrfs] ? __warn+0x81/0x130 ? create_pending_snapshot+0x1040/0x1190 [btrfs] ? report_bug+0x171/0x1a0 ? handle_bug+0x3a/0x70 ? exc_invalid_op+0x17/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? create_pending_snapshot+0x1040/0x1190 [btrfs] ? create_pending_snapshot+0x1040/0x1190 [btrfs] create_pending_snapshots+0x92/0xc0 [btrfs] btrfs_commit_transaction+0x66b/0xf40 [btrfs] btrfs_mksubvol+0x301/0x4d0 [btrfs] btrfs_mksnapshot+0x80/0xb0 [btrfs] __btrfs_ioctl_snap_create+0x1c2/0x1d0 [btrfs] btrfs_ioctl_snap_create_v2+0xc4/0x150 [btrfs] btrfs_ioctl+0x8a6/0x2650 [btrfs] ? kmem_cache_free+0x22/0x340 ? do_sys_openat2+0x97/0xe0 __x64_sys_ioctl+0x97/0xd0 do_syscall_64+0x46/0xf0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 RIP: 0033:0x7fe20abe83af RSP: 002b:00007ffe6eff1360 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fe20abe83af RDX: 00007ffe6eff23c0 RSI: 0000000050009417 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000000 R09: 00007fe20ad16cd0 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffe6eff13c0 R14: 00007fe20ad45000 R15: 0000559a120b6d58 ---[ end trace 0000000000000000 ]--- BTRFS: error (device vdc: state A) in create_pending_snapshot:1875: errno=-2 No such entry BTRFS info (device vdc: state EA): forced readonly BTRFS warning (device vdc: state EA): Skipping commit of aborted transaction. BTRFS: error (device vdc: state EA) in cleanup_transaction:2055: errno=-2 No such entry This happens because create_pending_snapshot() initializes the new root item as a copy of the source root item. This includes the refs field, which is 0 for a deleted subvolume. The call to btrfs_insert_root() therefore inserts a root with refs == 0. btrfs_get_new_fs_root() then finds the root and returns -ENOENT if refs == 0, which causes create_pending_snapshot() to abort. Fix it by checking the source root's refs before attempting the snapshot, but after locking subvol_sem to avoid racing with deletion. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Sweet Tea Dorminy Reviewed-by: Anand Jain Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4e50b62db2a8f..fea5d37528b80 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, return -EOPNOTSUPP; } + if (btrfs_root_refs(&root->root_item) == 0) + return -ENOENT; + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; -- GitLab From 3324d0547861b16cf436d54abba7052e0c8aa9de Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 4 Jan 2024 11:48:47 -0800 Subject: [PATCH 0854/1290] btrfs: avoid copying BTRFS_ROOT_SUBVOL_DEAD flag to snapshot of subvolume being deleted Sweet Tea spotted a race between subvolume deletion and snapshotting that can result in the root item for the snapshot having the BTRFS_ROOT_SUBVOL_DEAD flag set. The race is: Thread 1 | Thread 2 ----------------------------------------------|---------- btrfs_delete_subvolume | btrfs_set_root_flags(BTRFS_ROOT_SUBVOL_DEAD)| |btrfs_mksubvol | down_read(subvol_sem) | create_snapshot | ... | create_pending_snapshot | copy root item from source down_write(subvol_sem) | This flag is only checked in send and swap activate, which this would cause to fail mysteriously. create_snapshot() now checks the root refs to reject a deleted subvolume, so we can fix this by locking subvol_sem earlier so that the BTRFS_ROOT_SUBVOL_DEAD flag and the root refs are updated atomically. CC: stable@vger.kernel.org # 4.14+ Reported-by: Sweet Tea Dorminy Reviewed-by: Sweet Tea Dorminy Reviewed-by: Anand Jain Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b3e39610cc95a..7bcc1c03437a8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4458,6 +4458,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) u64 root_flags; int ret; + down_write(&fs_info->subvol_sem); + /* * Don't allow to delete a subvolume with send in progress. This is * inside the inode lock so the error handling that has to drop the bit @@ -4469,25 +4471,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } if (atomic_read(&dest->nr_swapfiles)) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", root->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - down_write(&fs_info->subvol_sem); - ret = may_destroy_subvol(dest); if (ret) - goto out_up_write; + goto out_undead; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -4497,7 +4499,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) - goto out_up_write; + goto out_undead; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -4563,15 +4565,17 @@ out_end_trans: inode->i_flags |= S_DEAD; out_release: btrfs_subvolume_release_metadata(root, &block_rsv); -out_up_write: - up_write(&fs_info->subvol_sem); +out_undead: if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - } else { + } +out_up_write: + up_write(&fs_info->subvol_sem); + if (!ret) { d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); -- GitLab From 173431b274a9a54fc10b273b46e67f46bcf62d2e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 10 Jan 2024 08:58:26 +1030 Subject: [PATCH 0855/1290] btrfs: defrag: reject unknown flags of btrfs_ioctl_defrag_range_args Add extra sanity check for btrfs_ioctl_defrag_range_args::flags. This is not really to enhance fuzzing tests, but as a preparation for future expansion on btrfs_ioctl_defrag_range_args. In the future we're going to add new members, allowing more fine tuning for btrfs defrag. Without the -ENONOTSUPP error, there would be no way to detect if the kernel supports those new defrag features. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 ++++ include/uapi/linux/btrfs.h | 3 +++ 2 files changed, 7 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fea5d37528b80..5d42319b43f2d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2602,6 +2602,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EFAULT; goto out; } + if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) { + ret = -EOPNOTSUPP; + goto out; + } /* compression requires us to start the IO */ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { range.flags |= BTRFS_DEFRAG_RANGE_START_IO; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 7c29d82db9ee0..f8bc34a6bcfa2 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -614,6 +614,9 @@ struct btrfs_ioctl_clone_range_args { */ #define BTRFS_DEFRAG_RANGE_COMPRESS 1 #define BTRFS_DEFRAG_RANGE_START_IO 2 +#define BTRFS_DEFRAG_RANGE_FLAGS_SUPP (BTRFS_DEFRAG_RANGE_COMPRESS | \ + BTRFS_DEFRAG_RANGE_START_IO) + struct btrfs_ioctl_defrag_range_args { /* start of the defrag operation */ __u64 start; -- GitLab From 567a1e852e872e702b18d271a3dbce2a75efbaff Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Tue, 2 Jan 2024 00:52:45 -0800 Subject: [PATCH 0856/1290] scsi: fcoe: Fix unsigned comparison with zero in store_ctlr_mode() ctlr->mode is of unsigned type, it is never less than zero. Fix this by using an extra variable called 'res', to store return value from sysfs_match_string() and assign that to ctlr->mode on the success path. Fixes: edc22a7c8688 ("scsi: fcoe: Use sysfs_match_string() over fcoe_parse_mode()") Signed-off-by: Harshit Mogalapalli Link: https://lore.kernel.org/r/20240102085245.600570-1-harshit.m.mogalapalli@oracle.com Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/fcoe/fcoe_sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/fcoe/fcoe_sysfs.c b/drivers/scsi/fcoe/fcoe_sysfs.c index 408a806bf4c2d..c64a085a7ee2f 100644 --- a/drivers/scsi/fcoe/fcoe_sysfs.c +++ b/drivers/scsi/fcoe/fcoe_sysfs.c @@ -263,6 +263,7 @@ static ssize_t store_ctlr_mode(struct device *dev, const char *buf, size_t count) { struct fcoe_ctlr_device *ctlr = dev_to_ctlr(dev); + int res; if (count > FCOE_MAX_MODENAME_LEN) return -EINVAL; @@ -279,12 +280,13 @@ static ssize_t store_ctlr_mode(struct device *dev, return -ENOTSUPP; } - ctlr->mode = sysfs_match_string(fip_conn_type_names, buf); - if (ctlr->mode < 0 || ctlr->mode == FIP_CONN_TYPE_UNKNOWN) { + res = sysfs_match_string(fip_conn_type_names, buf); + if (res < 0 || res == FIP_CONN_TYPE_UNKNOWN) { LIBFCOE_SYSFS_DBG(ctlr, "Unknown mode %s provided.\n", buf); return -EINVAL; } + ctlr->mode = res; ctlr->f->set_fcoe_ctlr_mode(ctlr); LIBFCOE_SYSFS_DBG(ctlr, "Mode changed to %s.\n", buf); -- GitLab From 38945c2b006b23a1a7a0c88d76e3294c6199891c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Jan 2024 21:41:55 +0300 Subject: [PATCH 0857/1290] scsi: fnic: unlock on error path in fnic_queuecommand() Call spin_unlock_irqrestore(&fnic->wq_copy_lock[hwq], flags) before returning. Fixes: c81df08cd294 ("scsi: fnic: Add support for multiqueue (MQ) in fnic driver") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/5360fa20-74bc-4c22-a78e-ea8b18c5410d@moroto.mountain Reviewed-by: Karan Tilak Kumar Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fnic_scsi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c index 4d6db4509e755..8d7fc5284293b 100644 --- a/drivers/scsi/fnic/fnic_scsi.c +++ b/drivers/scsi/fnic/fnic_scsi.c @@ -546,6 +546,7 @@ int fnic_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *sc) if (fnic->sw_copy_wq[hwq].io_req_table[blk_mq_unique_tag_to_tag(mqtag)] != NULL) { WARN(1, "fnic<%d>: %s: hwq: %d tag 0x%x already exists\n", fnic->fnic_num, __func__, hwq, blk_mq_unique_tag_to_tag(mqtag)); + spin_unlock_irqrestore(&fnic->wq_copy_lock[hwq], flags); return SCSI_MLQUEUE_HOST_BUSY; } -- GitLab From 6df0e077d76bd144c533b61d6182676aae6b0a85 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 11 Jan 2024 13:05:32 +0100 Subject: [PATCH 0858/1290] scsi: core: Kick the requeue list after inserting when flushing When libata calls ata_link_abort() to abort all ata queued commands, it calls blk_abort_request() on the SCSI command representing each QC. This causes scsi_timeout() to be called, which calls scsi_eh_scmd_add() for each SCSI command. scsi_eh_scmd_add() sets the SCSI host to state recovery, and then adds the command to shost->eh_cmd_q. This will wake up the SCSI EH, and eventually the libata EH strategy handler will be called, which calls scsi_eh_flush_done_q() to either flush retry or flush finish each failed command. The commands that are flush retried by scsi_eh_flush_done_q() are done so using scsi_queue_insert(). Before commit 8b566edbdbfb ("scsi: core: Only kick the requeue list if necessary"), __scsi_queue_insert() called blk_mq_requeue_request() with the second argument set to true, indicating that it should always kick/run the requeue list after inserting. After commit 8b566edbdbfb ("scsi: core: Only kick the requeue list if necessary"), __scsi_queue_insert() does not kick/run the requeue list after inserting, if the current SCSI host state is recovery (which is the case in the libata example above). This optimization is probably fine in most cases, as I can only assume that most often someone will eventually kick/run the queues. However, that is not the case for scsi_eh_flush_done_q(), where we can see that the request gets inserted to the requeue list, but the queue is never started after the request has been inserted, leading to the block layer waiting for the completion of command that never gets to run. Since scsi_eh_flush_done_q() is called by SCSI EH context, the SCSI host state is most likely always in recovery when this function is called. Thus, let scsi_eh_flush_done_q() explicitly kick the requeue list after inserting a flush retry command, so that scsi_eh_flush_done_q() keeps the same behavior as before commit 8b566edbdbfb ("scsi: core: Only kick the requeue list if necessary"). Simple reproducer for the libata example above: $ hdparm -Y /dev/sda $ echo 1 > /sys/class/scsi_device/0\:0\:0\:0/device/delete Fixes: 8b566edbdbfb ("scsi: core: Only kick the requeue list if necessary") Reported-by: Kevin Locke Closes: https://lore.kernel.org/linux-scsi/ZZw3Th70wUUvCiCY@kevinlocke.name/ Signed-off-by: Niklas Cassel Link: https://lore.kernel.org/r/20240111120533.3612509-1-cassel@kernel.org Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_error.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 1bac12ef238ef..44c1a89b717a9 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -2196,15 +2196,18 @@ void scsi_eh_flush_done_q(struct list_head *done_q) struct scsi_cmnd *scmd, *next; list_for_each_entry_safe(scmd, next, done_q, eh_entry) { + struct scsi_device *sdev = scmd->device; + list_del_init(&scmd->eh_entry); - if (scsi_device_online(scmd->device) && - !scsi_noretry_cmd(scmd) && scsi_cmd_retry_allowed(scmd) && - scsi_eh_should_retry_cmd(scmd)) { + if (scsi_device_online(sdev) && !scsi_noretry_cmd(scmd) && + scsi_cmd_retry_allowed(scmd) && + scsi_eh_should_retry_cmd(scmd)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: flush retry cmd\n", current->comm)); scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY); + blk_mq_kick_requeue_list(sdev->request_queue); } else { /* * If just we got sense for the device (called -- GitLab From 83ab68168a3d990d5ff39ab030ad5754cbbccb25 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Thu, 11 Jan 2024 15:59:41 +0300 Subject: [PATCH 0859/1290] scsi: target: core: Add TMF to tmr_list handling An abort that is responded to by iSCSI itself is added to tmr_list but does not go to target core. A LUN_RESET that goes through tmr_list takes a refcounter on the abort and waits for completion. However, the abort will be never complete because it was not started in target core. Unable to locate ITT: 0x05000000 on CID: 0 Unable to locate RefTaskTag: 0x05000000 on CID: 0. wait_for_tasks: Stopping tmf LUN_RESET with tag 0x0 ref_task_tag 0x0 i_state 34 t_state ISTATE_PROCESSING refcnt 2 transport_state active,stop,fabric_stop wait for tasks: tmf LUN_RESET with tag 0x0 ref_task_tag 0x0 i_state 34 t_state ISTATE_PROCESSING refcnt 2 transport_state active,stop,fabric_stop ... INFO: task kworker/0:2:49 blocked for more than 491 seconds. task:kworker/0:2 state:D stack: 0 pid: 49 ppid: 2 flags:0x00000800 Workqueue: events target_tmr_work [target_core_mod] Call Trace: __switch_to+0x2c4/0x470 _schedule+0x314/0x1730 schedule+0x64/0x130 schedule_timeout+0x168/0x430 wait_for_completion+0x140/0x270 target_put_cmd_and_wait+0x64/0xb0 [target_core_mod] core_tmr_lun_reset+0x30/0xa0 [target_core_mod] target_tmr_work+0xc8/0x1b0 [target_core_mod] process_one_work+0x2d4/0x5d0 worker_thread+0x78/0x6c0 To fix this, only add abort to tmr_list if it will be handled by target core. Signed-off-by: Dmitry Bogdanov Link: https://lore.kernel.org/r/20240111125941.8688-1-d.bogdanov@yadro.com Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/target/target_core_device.c | 5 ----- drivers/target/target_core_transport.c | 4 ++++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 506193e870c49..7a85e6477e465 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -147,7 +147,6 @@ int transport_lookup_tmr_lun(struct se_cmd *se_cmd) struct se_session *se_sess = se_cmd->se_sess; struct se_node_acl *nacl = se_sess->se_node_acl; struct se_tmr_req *se_tmr = se_cmd->se_tmr_req; - unsigned long flags; rcu_read_lock(); deve = target_nacl_find_deve(nacl, se_cmd->orig_fe_lun); @@ -178,10 +177,6 @@ out_unlock: se_cmd->se_dev = rcu_dereference_raw(se_lun->lun_se_dev); se_tmr->tmr_dev = rcu_dereference_raw(se_lun->lun_se_dev); - spin_lock_irqsave(&se_tmr->tmr_dev->se_tmr_lock, flags); - list_add_tail(&se_tmr->tmr_list, &se_tmr->tmr_dev->dev_tmr_list); - spin_unlock_irqrestore(&se_tmr->tmr_dev->se_tmr_lock, flags); - return 0; } EXPORT_SYMBOL(transport_lookup_tmr_lun); diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 670cfb7bd426a..73d0d6133ac8f 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -3629,6 +3629,10 @@ int transport_generic_handle_tmr( unsigned long flags; bool aborted = false; + spin_lock_irqsave(&cmd->se_dev->se_tmr_lock, flags); + list_add_tail(&cmd->se_tmr_req->tmr_list, &cmd->se_dev->dev_tmr_list); + spin_unlock_irqrestore(&cmd->se_dev->se_tmr_lock, flags); + spin_lock_irqsave(&cmd->t_state_lock, flags); if (cmd->transport_state & CMD_T_ABORTED) { aborted = true; -- GitLab From cdac6e1f716419ce307ad3e44a718557a5469c17 Mon Sep 17 00:00:00 2001 From: Chancel Liu Date: Thu, 11 Jan 2024 11:52:19 +0900 Subject: [PATCH 0860/1290] ALSA: aloop: Introduce a function to get if access is interleaved mode There's a use case that playback stream of a loopback cable works on RW_INTERLEAVED mode while capture stream works on MMAP_INTERLEAVED mode: aplay -Dhw:Loopback,0,0 S32_48K_2ch.wav; arecord -Dplughw:Loopback,1,0 -fS32_LE -r16000 -c2 cap.wav; The plug plugin handles only slave PCM support MMAP mode. Not only plug plugin but also other plugins like direct plugins(dmix/dsnoop/dshare) work on MMAP access mode. In this case capture stream is the slave PCM works on MMAP_INTERLEAVED mode. However loopback_check_format() rejects this access setting and return: arecord: pcm_read:2240: read error: Input/output error To fix it a function called is_access_interleaved() is introduced to get if access is interleaved mode. If both access of capture stream and playback stream is interleaved mode loopback_check_format() will allow this kind of access setting. Fixes: 462494565c27 ("ALSA: aloop: Add support for the non-interleaved access mode") Signed-off-by: Chancel Liu Link: https://lore.kernel.org/r/20240111025219.2678764-1-chancel.liu@nxp.com Signed-off-by: Takashi Iwai --- sound/drivers/aloop.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/sound/drivers/aloop.c b/sound/drivers/aloop.c index e87dc67f33c69..1c65e0a3b13ce 100644 --- a/sound/drivers/aloop.c +++ b/sound/drivers/aloop.c @@ -322,6 +322,17 @@ static int loopback_snd_timer_close_cable(struct loopback_pcm *dpcm) return 0; } +static bool is_access_interleaved(snd_pcm_access_t access) +{ + switch (access) { + case SNDRV_PCM_ACCESS_MMAP_INTERLEAVED: + case SNDRV_PCM_ACCESS_RW_INTERLEAVED: + return true; + default: + return false; + } +}; + static int loopback_check_format(struct loopback_cable *cable, int stream) { struct snd_pcm_runtime *runtime, *cruntime; @@ -341,7 +352,8 @@ static int loopback_check_format(struct loopback_cable *cable, int stream) check = runtime->format != cruntime->format || runtime->rate != cruntime->rate || runtime->channels != cruntime->channels || - runtime->access != cruntime->access; + is_access_interleaved(runtime->access) != + is_access_interleaved(cruntime->access); if (!check) return 0; if (stream == SNDRV_PCM_STREAM_CAPTURE) { @@ -369,7 +381,8 @@ static int loopback_check_format(struct loopback_cable *cable, int stream) &setup->channels_id); setup->channels = runtime->channels; } - if (setup->access != runtime->access) { + if (is_access_interleaved(setup->access) != + is_access_interleaved(runtime->access)) { snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &setup->access_id); setup->access = runtime->access; @@ -584,8 +597,7 @@ static void copy_play_buf(struct loopback_pcm *play, size = play->pcm_buffer_size - src_off; if (dst_off + size > capt->pcm_buffer_size) size = capt->pcm_buffer_size - dst_off; - if (runtime->access == SNDRV_PCM_ACCESS_RW_NONINTERLEAVED || - runtime->access == SNDRV_PCM_ACCESS_MMAP_NONINTERLEAVED) + if (!is_access_interleaved(runtime->access)) copy_play_buf_part_n(play, capt, size, src_off, dst_off); else memcpy(dst + dst_off, src + src_off, size); @@ -1544,8 +1556,7 @@ static int loopback_access_get(struct snd_kcontrol *kcontrol, mutex_lock(&loopback->cable_lock); access = loopback->setup[kcontrol->id.subdevice][kcontrol->id.device].access; - ucontrol->value.enumerated.item[0] = access == SNDRV_PCM_ACCESS_RW_NONINTERLEAVED || - access == SNDRV_PCM_ACCESS_MMAP_NONINTERLEAVED; + ucontrol->value.enumerated.item[0] = !is_access_interleaved(access); mutex_unlock(&loopback->cable_lock); return 0; -- GitLab From acd66c2126eb9b5da2d89ae07dbcd73b909c2111 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Wed, 10 Jan 2024 12:37:30 +0100 Subject: [PATCH 0861/1290] net: micrel: Fix PTP frame parsing for lan8841 The HW has the capability to check each frame if it is a PTP frame, which domain it is, which ptp frame type it is, different ip address in the frame. And if one of these checks fail then the frame is not timestamp. Most of these checks were disabled except checking the field minorVersionPTP inside the PTP header. Meaning that once a partner sends a frame compliant to 8021AS which has minorVersionPTP set to 1, then the frame was not timestamp because the HW expected by default a value of 0 in minorVersionPTP. Fix this issue by removing this check so the userspace can decide on this. Fixes: cafc3662ee3f ("net: micrel: Add PHC support for lan8841") Signed-off-by: Horatiu Vultur Reviewed-by: Divya Koppera Reviewed-by: Rahul Rameshbabu Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index d2aa3d0695e33..bf4053431dcb3 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -3338,8 +3338,10 @@ static int lan8814_probe(struct phy_device *phydev) #define LAN8841_ADC_CHANNEL_MASK 198 #define LAN8841_PTP_RX_PARSE_L2_ADDR_EN 370 #define LAN8841_PTP_RX_PARSE_IP_ADDR_EN 371 +#define LAN8841_PTP_RX_VERSION 374 #define LAN8841_PTP_TX_PARSE_L2_ADDR_EN 434 #define LAN8841_PTP_TX_PARSE_IP_ADDR_EN 435 +#define LAN8841_PTP_TX_VERSION 438 #define LAN8841_PTP_CMD_CTL 256 #define LAN8841_PTP_CMD_CTL_PTP_ENABLE BIT(2) #define LAN8841_PTP_CMD_CTL_PTP_DISABLE BIT(1) @@ -3383,6 +3385,12 @@ static int lan8841_config_init(struct phy_device *phydev) phy_write_mmd(phydev, KSZ9131RN_MMD_COMMON_CTRL_REG, LAN8841_PTP_RX_PARSE_IP_ADDR_EN, 0); + /* Disable checking for minorVersionPTP field */ + phy_write_mmd(phydev, KSZ9131RN_MMD_COMMON_CTRL_REG, + LAN8841_PTP_RX_VERSION, 0xff00); + phy_write_mmd(phydev, KSZ9131RN_MMD_COMMON_CTRL_REG, + LAN8841_PTP_TX_VERSION, 0xff00); + /* 100BT Clause 40 improvenent errata */ phy_write_mmd(phydev, LAN8841_MMD_ANALOG_REG, LAN8841_ANALOG_CONTROL_1, -- GitLab From e398822c4751017fe401f57409488f5948d12fb5 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Fri, 5 Jan 2024 10:52:42 +0200 Subject: [PATCH 0862/1290] net: phy: micrel: populate .soft_reset for KSZ9131 The RZ/G3S SMARC Module has 2 KSZ9131 PHYs. In this setup, the KSZ9131 PHY is used with the ravb Ethernet driver. It has been discovered that when bringing the Ethernet interface down/up continuously, e.g., with the following sh script: $ while :; do ifconfig eth0 down; ifconfig eth0 up; done the link speed and duplex are wrong after interrupting the bring down/up operation even though the Ethernet interface is up. To recover from this state the following configuration sequence is necessary (executed manually): $ ifconfig eth0 down $ ifconfig eth0 up The behavior has been identified also on the Microchip SAMA7G5-EK board which runs the macb driver and uses the same PHY. The order of PHY-related operations in ravb_open() is as follows: ravb_open() -> ravb_phy_start() -> ravb_phy_init() -> of_phy_connect() -> phy_connect_direct() -> phy_attach_direct() -> phy_init_hw() -> phydev->drv->soft_reset() phydev->drv->config_init() phydev->drv->config_intr() phy_resume() kszphy_resume() The order of PHY-related operations in ravb_close is as follows: ravb_close() -> phy_stop() -> phy_suspend() -> kszphy_suspend() -> genphy_suspend() // set BMCR_PDOWN bit in MII_BMCR In genphy_suspend() setting the BMCR_PDWN bit in MII_BMCR switches the PHY to Software Power-Down (SPD) mode (according to the KSZ9131 datasheet). Thus, when opening the interface after it has been previously closed (via ravb_close()), the phydev->drv->config_init() and phydev->drv->config_intr() reach the KSZ9131 PHY driver via the ksz9131_config_init() and kszphy_config_intr() functions. KSZ9131 specifies that the MII management interface remains operational during SPD (Software Power-Down), but (according to manual): - Only access to the standard registers (0 through 31) is supported. - Access to MMD address spaces other than MMD address space 1 is possible if the spd_clock_gate_override bit is set. - Access to MMD address space 1 is not possible. The spd_clock_gate_override bit is not used in the KSZ9131 driver. ksz9131_config_init() configures RGMII delay, pad skews and LEDs by accessesing MMD registers other than those in address space 1. The datasheet for the KSZ9131 does not specify what happens if registers from an unsupported address space are accessed while the PHY is in SPD. To fix the issue the .soft_reset method has been instantiated for KSZ9131, too. This resets the PHY to the default state before doing any configurations to it, thus switching it out of SPD. Fixes: bff5b4b37372 ("net: phy: micrel: add Microchip KSZ9131 initial driver") Signed-off-by: Claudiu Beznea Reviewed-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index bf4053431dcb3..81c20eb4b54b9 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -4847,6 +4847,7 @@ static struct phy_driver ksphy_driver[] = { .flags = PHY_POLL_CABLE_TEST, .driver_data = &ksz9131_type, .probe = kszphy_probe, + .soft_reset = genphy_soft_reset, .config_init = ksz9131_config_init, .config_intr = kszphy_config_intr, .config_aneg = ksz9131_config_aneg, -- GitLab From a03cfad512ac24a35184d7d87ec0d5489e1cb763 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 12 Jan 2024 12:10:23 +0100 Subject: [PATCH 0863/1290] ALSA: oxygen: Fix right channel of capture volume mixer There was a typo in oxygen mixer code that didn't update the right channel value properly for the capture volume. Let's fix it. This trivial fix was originally reported on Bugzilla. Fixes: a3601560496d ("[ALSA] oxygen: add front panel controls") Cc: Link: https://bugzilla.kernel.org/show_bug.cgi?id=156561 Link: https://lore.kernel.org/r/20240112111023.6208-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/oxygen/oxygen_mixer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/oxygen/oxygen_mixer.c b/sound/pci/oxygen/oxygen_mixer.c index 46705ec77b481..eb3aca16359c5 100644 --- a/sound/pci/oxygen/oxygen_mixer.c +++ b/sound/pci/oxygen/oxygen_mixer.c @@ -718,7 +718,7 @@ static int ac97_fp_rec_volume_put(struct snd_kcontrol *ctl, oldreg = oxygen_read_ac97(chip, 1, AC97_REC_GAIN); newreg = oldreg & ~0x0707; newreg = newreg | (value->value.integer.value[0] & 7); - newreg = newreg | ((value->value.integer.value[0] & 7) << 8); + newreg = newreg | ((value->value.integer.value[1] & 7) << 8); change = newreg != oldreg; if (change) oxygen_write_ac97(chip, 1, AC97_REC_GAIN, newreg); -- GitLab From 907ee6681788556b9ade3ad0a1f6f4aea192399c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2024 11:33:11 -0800 Subject: [PATCH 0864/1290] net: fill in MODULE_DESCRIPTION()s for wx_lib W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add a description to Wangxun's common code lib. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 23355cc408fd7..8706223a6e5aa 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -2769,4 +2769,5 @@ void wx_set_ring(struct wx *wx, u32 new_tx_count, } EXPORT_SYMBOL(wx_set_ring); +MODULE_DESCRIPTION("Common library for Wangxun(R) Ethernet drivers."); MODULE_LICENSE("GPL"); -- GitLab From b95df3bd1ea31fadc9e1471a036b4c08199aa0f0 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Wed, 10 Jan 2024 07:40:07 +0000 Subject: [PATCH 0865/1290] arm64: irq: include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sorting include files in alphabetic order in drivers/tty/serial/samsung.c revealed the following error: In file included from drivers/tty/serial/samsung_tty.c:24: ./arch/arm64/include/asm/irq.h:9:43: error: unknown type name ‘cpumask_t’ 9 | void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu); | ^~~~~~~~~ Include cpumask.h to avoid unknown type errors for parents of irq.h that don't include cpumask.h. Acked-by: Mark Rutland Signed-off-by: Tudor Ambarus Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20240110074007.4020016-1-tudor.ambarus@linaro.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/irq.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 50ce8b697ff36..e93548914c366 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -4,6 +4,8 @@ #ifndef __ASSEMBLER__ +#include + #include void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu); -- GitLab From 8c5a19cb17a71e52303150335b459c7d2d28a155 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 10 Jan 2024 14:26:20 +0100 Subject: [PATCH 0866/1290] arm64: scs: Work around full LTO issue with dynamic SCS Full LTO takes the '-mbranch-protection=none' passed to the compiler when generating the dynamic shadow call stack patching code as a hint to stop emitting PAC instructions altogether. (Thin LTO appears unaffected by this) Work around this by stripping unwind tables from the object in question, which should be sufficient to prevent the patching code from attempting to patch itself. Fixes: 3b619e22c460 ("arm64: implement dynamic shadow call stack for Clang") Signed-off-by: Ard Biesheuvel Reviewed-by: Sami Tolvanen Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240110132619.258809-2-ardb+git@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index d95b3d6b471a7..e5d03a7039b4b 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -73,7 +73,13 @@ obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.o -CFLAGS_patch-scs.o += -mbranch-protection=none + +# We need to prevent the SCS patching code from patching itself. Using +# -mbranch-protection=none here to avoid the patchable PAC opcodes from being +# generated triggers an issue with full LTO on Clang, which stops emitting PAC +# instructions altogether. So instead, omit the unwind tables used by the +# patching code, so it will not be able to locate its own PAC instructions. +CFLAGS_patch-scs.o += -fno-asynchronous-unwind-tables -fno-unwind-tables # Force dependency (vdso*-wrap.S includes vdso.so through incbin) $(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so -- GitLab From 3931261ecf46151a5e779c96fb3da00677b6dc37 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 11 Jan 2024 12:24:48 +0100 Subject: [PATCH 0867/1290] arm64: fpsimd: Bring cond_yield asm macro in line with new rules We no longer disable softirqs or preemption when doing kernel mode SIMD, and so for fully preemptible kernels, there is no longer a need to do any explicit yielding (and for non-preemptible kernels, yielding is not needed either). That leaves voluntary preemption, where only explicit yield calls may result in a reschedule. To retain the existing behavior for such a configuration, we should take the new situation into account, where the preempt count will be zero rather than one, and yielding to pending softirqs is unnecessary. Fixes: aefbab8e77eb ("arm64: fpsimd: Preserve/restore kernel mode NEON at context switch") Signed-off-by: Ard Biesheuvel Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240111112447.577640-2-ardb+git@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 25 +++++++++---------------- arch/arm64/kernel/asm-offsets.c | 2 -- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 7b1975bf4b90e..513787e433299 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -760,32 +760,25 @@ alternative_endif .endm /* - * Check whether preempt/bh-disabled asm code should yield as soon as - * it is able. This is the case if we are currently running in task - * context, and either a softirq is pending, or the TIF_NEED_RESCHED - * flag is set and re-enabling preemption a single time would result in - * a preempt count of zero. (Note that the TIF_NEED_RESCHED flag is - * stored negated in the top word of the thread_info::preempt_count + * Check whether asm code should yield as soon as it is able. This is + * the case if we are currently running in task context, and the + * TIF_NEED_RESCHED flag is set. (Note that the TIF_NEED_RESCHED flag + * is stored negated in the top word of the thread_info::preempt_count * field) */ - .macro cond_yield, lbl:req, tmp:req, tmp2:req + .macro cond_yield, lbl:req, tmp:req, tmp2 +#ifdef CONFIG_PREEMPT_VOLUNTARY get_current_task \tmp ldr \tmp, [\tmp, #TSK_TI_PREEMPT] /* * If we are serving a softirq, there is no point in yielding: the * softirq will not be preempted no matter what we do, so we should - * run to completion as quickly as we can. + * run to completion as quickly as we can. The preempt_count field will + * have BIT(SOFTIRQ_SHIFT) set in this case, so the zero check will + * catch this case too. */ - tbnz \tmp, #SOFTIRQ_SHIFT, .Lnoyield_\@ -#ifdef CONFIG_PREEMPTION - sub \tmp, \tmp, #PREEMPT_DISABLE_OFFSET cbz \tmp, \lbl #endif - adr_l \tmp, irq_stat + IRQ_CPUSTAT_SOFTIRQ_PENDING - get_this_cpu_offset \tmp2 - ldr w\tmp, [\tmp, \tmp2] - cbnz w\tmp, \lbl // yield on pending softirq in task context -.Lnoyield_\@: .endm /* diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 5ff1942b04fcf..5a7dbbe0ce639 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -117,8 +117,6 @@ int main(void) DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); BLANK(); DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); - DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT); - DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending)); BLANK(); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); -- GitLab From 546b7cde9b1dd36089649101b75266564600ffe5 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 10 Jan 2024 11:29:20 -0600 Subject: [PATCH 0868/1290] arm64: Rename ARM64_WORKAROUND_2966298 In preparation to apply ARM64_WORKAROUND_2966298 for multiple errata, rename the kconfig and capability. No functional change. Cc: stable@vger.kernel.org Signed-off-by: Rob Herring Reviewed-by: Mark Rutland Link: https://lore.kernel.org/r/20240110-arm-errata-a510-v1-1-d02bc51aeeee@kernel.org Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 4 ++++ arch/arm64/kernel/cpu_errata.c | 4 ++-- arch/arm64/kernel/entry.S | 2 +- arch/arm64/tools/cpucaps | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b67e6934316fd..96f31e235a1a0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1037,8 +1037,12 @@ config ARM64_ERRATUM_2645198 If unsure, say Y. +config ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD + bool + config ARM64_ERRATUM_2966298 bool "Cortex-A520: 2966298: workaround for speculatively executed unprivileged load" + select ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD default y help This option adds the workaround for ARM Cortex-A520 erratum 2966298. diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index e29e0fea63fb6..cb5e0622168de 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -713,10 +713,10 @@ const struct arm64_cpu_capabilities arm64_errata[] = { MIDR_FIXED(MIDR_CPU_VAR_REV(1,1), BIT(25)), }, #endif -#ifdef CONFIG_ARM64_ERRATUM_2966298 +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD { .desc = "ARM erratum 2966298", - .capability = ARM64_WORKAROUND_2966298, + .capability = ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD, /* Cortex-A520 r0p0 - r0p1 */ ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A520, 0, 0, 1), }, diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a6030913cd58c..544ab46649f3f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -428,7 +428,7 @@ alternative_else_nop_endif ldp x28, x29, [sp, #16 * 14] .if \el == 0 -alternative_if ARM64_WORKAROUND_2966298 +alternative_if ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD tlbi vale1, xzr dsb nsh alternative_else_nop_endif diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 1e07d74d7a6c9..b912b1409fc09 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -84,7 +84,6 @@ WORKAROUND_2077057 WORKAROUND_2457168 WORKAROUND_2645198 WORKAROUND_2658417 -WORKAROUND_2966298 WORKAROUND_AMPERE_AC03_CPU_38 WORKAROUND_TRBE_OVERWRITE_FILL_MODE WORKAROUND_TSB_FLUSH_FAILURE @@ -100,3 +99,4 @@ WORKAROUND_NVIDIA_CARMEL_CNP WORKAROUND_QCOM_FALKOR_E1003 WORKAROUND_REPEAT_TLBI WORKAROUND_SPECULATIVE_AT +WORKAROUND_SPECULATIVE_UNPRIV_LOAD -- GitLab From f827bcdafa2a2ac21c91e47f587e8d0c76195409 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 10 Jan 2024 11:29:21 -0600 Subject: [PATCH 0869/1290] arm64: errata: Add Cortex-A510 speculative unprivileged load workaround Implement the workaround for ARM Cortex-A510 erratum 3117295. On an affected Cortex-A510 core, a speculatively executed unprivileged load might leak data from a privileged load via a cache side channel. The issue only exists for loads within a translation regime with the same translation (e.g. same ASID and VMID). Therefore, the issue only affects the return to EL0. The erratum and workaround are the same as ARM Cortex-A520 erratum 2966298, so reuse the existing workaround. Cc: stable@vger.kernel.org Signed-off-by: Rob Herring Reviewed-by: Mark Rutland Link: https://lore.kernel.org/r/20240110-arm-errata-a510-v1-2-d02bc51aeeee@kernel.org Signed-off-by: Will Deacon --- Documentation/arch/arm64/silicon-errata.rst | 2 ++ arch/arm64/Kconfig | 14 ++++++++++++++ arch/arm64/kernel/cpu_errata.c | 17 +++++++++++++++-- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index f47f63bcf67c9..7acd64c61f50c 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -71,6 +71,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A510 | #2658417 | ARM64_ERRATUM_2658417 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A510 | #3117295 | ARM64_ERRATUM_3117295 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A520 | #2966298 | ARM64_ERRATUM_2966298 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 96f31e235a1a0..bfd2752493666 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1054,6 +1054,20 @@ config ARM64_ERRATUM_2966298 If unsure, say Y. +config ARM64_ERRATUM_3117295 + bool "Cortex-A510: 3117295: workaround for speculatively executed unprivileged load" + select ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD + default y + help + This option adds the workaround for ARM Cortex-A510 erratum 3117295. + + On an affected Cortex-A510 core, a speculatively executed unprivileged + load might leak data from a privileged level via a cache side channel. + + Work around this problem by executing a TLBI before returning to EL0. + + If unsure, say Y. + config CAVIUM_ERRATUM_22375 bool "Cavium erratum 22375, 24313" default y diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index cb5e0622168de..967c7c7a4e7db 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -416,6 +416,19 @@ static struct midr_range broken_aarch32_aes[] = { }; #endif /* CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE */ +#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD +static const struct midr_range erratum_spec_unpriv_load_list[] = { +#ifdef CONFIG_ARM64_ERRATUM_3117295 + MIDR_ALL_VERSIONS(MIDR_CORTEX_A510), +#endif +#ifdef CONFIG_ARM64_ERRATUM_2966298 + /* Cortex-A520 r0p0 to r0p1 */ + MIDR_REV_RANGE(MIDR_CORTEX_A520, 0, 0, 1), +#endif + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { @@ -715,10 +728,10 @@ const struct arm64_cpu_capabilities arm64_errata[] = { #endif #ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD { - .desc = "ARM erratum 2966298", + .desc = "ARM errata 2966298, 3117295", .capability = ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD, /* Cortex-A520 r0p0 - r0p1 */ - ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A520, 0, 0, 1), + ERRATA_MIDR_RANGE_LIST(erratum_spec_unpriv_load_list), }, #endif #ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 -- GitLab From 8d0e8a8aa3a1e844b7a969e9fe80937667507c6c Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Tue, 14 Nov 2023 10:53:22 +0100 Subject: [PATCH 0870/1290] s390/pai: rework paixxxx_getctr interface Simplify the interface for functions paicrypt_getctr() and paiext_getctr(). Change the first parameter from a pointer to a structure to a pointer to a structure member. The other members of the structure are not needed. No functional change. Signed-off-by: Thomas Richter Acked-by: Mete Durlu Signed-off-by: Alexander Gordeev --- arch/s390/kernel/perf_pai_crypto.c | 12 ++++++------ arch/s390/kernel/perf_pai_ext.c | 11 ++++++----- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index 39a91b00438a7..92156e14fc6f4 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -111,11 +111,11 @@ static void paicrypt_event_destroy(struct perf_event *event) mutex_unlock(&pai_reserve_mutex); } -static u64 paicrypt_getctr(struct paicrypt_map *cpump, int nr, bool kernel) +static u64 paicrypt_getctr(unsigned long *page, int nr, bool kernel) { if (kernel) nr += PAI_CRYPTO_MAXCTR; - return cpump->page[nr]; + return page[nr]; } /* Read the counter values. Return value from location in CMP. For event @@ -129,13 +129,13 @@ static u64 paicrypt_getdata(struct perf_event *event, bool kernel) int i; if (event->attr.config != PAI_CRYPTO_BASE) { - return paicrypt_getctr(cpump, + return paicrypt_getctr(cpump->page, event->attr.config - PAI_CRYPTO_BASE, kernel); } for (i = 1; i <= paicrypt_cnt; i++) { - u64 val = paicrypt_getctr(cpump, i, kernel); + u64 val = paicrypt_getctr(cpump->page, i, kernel); if (!val) continue; @@ -383,9 +383,9 @@ static size_t paicrypt_copy(struct pai_userdata *userdata, u64 val = 0; if (!exclude_kernel) - val += paicrypt_getctr(cpump, i, true); + val += paicrypt_getctr(cpump->page, i, true); if (!exclude_user) - val += paicrypt_getctr(cpump, i, false); + val += paicrypt_getctr(cpump->page, i, false); if (val) { userdata[outidx].num = i; userdata[outidx].value = val; diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index e7013a2e89605..9452205e6febc 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -276,9 +276,9 @@ static int paiext_event_init(struct perf_event *event) return 0; } -static u64 paiext_getctr(struct paiext_map *cpump, int nr) +static u64 paiext_getctr(unsigned long *area, int nr) { - return cpump->area[nr]; + return area[nr]; } /* Read the counter values. Return value from location in buffer. For event @@ -292,10 +292,11 @@ static u64 paiext_getdata(struct perf_event *event) int i; if (event->attr.config != PAI_NNPA_BASE) - return paiext_getctr(cpump, event->attr.config - PAI_NNPA_BASE); + return paiext_getctr(cpump->area, + event->attr.config - PAI_NNPA_BASE); for (i = 1; i <= paiext_cnt; i++) - sum += paiext_getctr(cpump, i); + sum += paiext_getctr(cpump->area, i); return sum; } @@ -392,7 +393,7 @@ static size_t paiext_copy(struct paiext_map *cpump) int i, outidx = 0; for (i = 1; i <= paiext_cnt; i++) { - u64 val = paiext_getctr(cpump, i); + u64 val = paiext_getctr(cpump->area, i); if (val) { userdata[outidx].num = i; -- GitLab From 0578a54110ff28dc2b3830e90ad2dd9c1e065e90 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Tue, 14 Nov 2023 12:00:38 +0100 Subject: [PATCH 0871/1290] s390/pai_crypto: split function paicrypt_push_sample Split function paicrypt_push_sample() into two parts. The first part determines the number of bytes to store as raw data in the perf sample record. The second part stores the raw data in the perf event's ring buffer. No functional change. Signed-off-by: Thomas Richter Acked-by: Mete Durlu Signed-off-by: Alexander Gordeev --- arch/s390/kernel/perf_pai_crypto.c | 43 +++++++++++++++++------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index 92156e14fc6f4..ee8a011c46e97 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -373,8 +373,7 @@ static void paicrypt_del(struct perf_event *event, int flags) * 2 bytes: Number of counter * 8 bytes: Value of counter */ -static size_t paicrypt_copy(struct pai_userdata *userdata, - struct paicrypt_map *cpump, +static size_t paicrypt_copy(struct pai_userdata *userdata, unsigned long *page, bool exclude_user, bool exclude_kernel) { int i, outidx = 0; @@ -383,9 +382,9 @@ static size_t paicrypt_copy(struct pai_userdata *userdata, u64 val = 0; if (!exclude_kernel) - val += paicrypt_getctr(cpump->page, i, true); + val += paicrypt_getctr(page, i, true); if (!exclude_user) - val += paicrypt_getctr(cpump->page, i, false); + val += paicrypt_getctr(page, i, false); if (val) { userdata[outidx].num = i; userdata[outidx].value = val; @@ -395,25 +394,14 @@ static size_t paicrypt_copy(struct pai_userdata *userdata, return outidx * sizeof(struct pai_userdata); } -static int paicrypt_push_sample(void) +static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump, + struct perf_event *event) { - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - struct perf_event *event = cpump->event; struct perf_sample_data data; struct perf_raw_record raw; struct pt_regs regs; - size_t rawsize; int overflow; - if (!cpump->event) /* No event active */ - return 0; - rawsize = paicrypt_copy(cpump->save, cpump, - cpump->event->attr.exclude_user, - cpump->event->attr.exclude_kernel); - if (!rawsize) /* No incremented counters */ - return 0; - /* Setup perf sample */ memset(®s, 0, sizeof(regs)); memset(&raw, 0, sizeof(raw)); @@ -444,6 +432,25 @@ static int paicrypt_push_sample(void) return overflow; } +/* Check if there is data to be saved on schedule out of a task. */ +static int paicrypt_have_sample(void) +{ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + struct perf_event *event = cpump->event; + size_t rawsize; + int rc = 0; + + if (!event) /* No event active */ + return 0; + rawsize = paicrypt_copy(cpump->save, cpump->page, + cpump->event->attr.exclude_user, + cpump->event->attr.exclude_kernel); + if (rawsize) /* No incremented counters */ + rc = paicrypt_push_sample(rawsize, cpump, event); + return rc; +} + /* Called on schedule-in and schedule-out. No access to event structure, * but for sampling only event CRYPTO_ALL is allowed. */ @@ -453,7 +460,7 @@ static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sch * results on schedule_out and if page was dirty, clear values. */ if (!sched_in) - paicrypt_push_sample(); + paicrypt_have_sample(); } /* Attribute definitions for paicrypt interface. As with other CPU -- GitLab From cb1259b7b574bd90ef22dac2c6282327cdae31c6 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 15 Nov 2023 11:04:25 +0100 Subject: [PATCH 0872/1290] s390/pai: rework paiXXX_start and paiXXX_stop functions The PAI crypto counter and PAI NNPA counters start and stop functions are streamlined. Move the conditions to invoke start and stop functions to its respective function body and call them unconditionally. The start and stop functions now determine how to proceed. No functional change. Signed-off-by: Thomas Richter Acked-by: Mete Durlu Signed-off-by: Alexander Gordeev --- arch/s390/kernel/perf_pai_crypto.c | 29 ++++++++++++------------- arch/s390/kernel/perf_pai_ext.c | 35 ++++++++++++++---------------- 2 files changed, 30 insertions(+), 34 deletions(-) diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index ee8a011c46e97..bf8a672b15a41 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -317,10 +317,14 @@ static void paicrypt_start(struct perf_event *event, int flags) * Events are added, deleted and re-added when 2 or more events * are active at the same time. */ - if (!event->hw.last_tag) { - event->hw.last_tag = 1; - sum = paicrypt_getall(event); /* Get current value */ - local64_set(&event->hw.prev_count, sum); + if (!event->attr.sample_period) { /* Counting */ + if (!event->hw.last_tag) { + event->hw.last_tag = 1; + sum = paicrypt_getall(event); /* Get current value */ + local64_set(&event->hw.prev_count, sum); + } + } else { /* Sampling */ + perf_sched_cb_inc(event->pmu); } } @@ -336,19 +340,18 @@ static int paicrypt_add(struct perf_event *event, int flags) local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); } cpump->event = event; - if (flags & PERF_EF_START && !event->attr.sample_period) { - /* Only counting needs initial counter value */ + if (flags & PERF_EF_START) paicrypt_start(event, PERF_EF_RELOAD); - } event->hw.state = 0; - if (event->attr.sample_period) - perf_sched_cb_inc(event->pmu); return 0; } static void paicrypt_stop(struct perf_event *event, int flags) { - paicrypt_read(event); + if (!event->attr.sample_period) /* Counting */ + paicrypt_read(event); + else /* Sampling */ + perf_sched_cb_dec(event->pmu); event->hw.state = PERF_HES_STOPPED; } @@ -357,11 +360,7 @@ static void paicrypt_del(struct perf_event *event, int flags) struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); struct paicrypt_map *cpump = mp->mapptr; - if (event->attr.sample_period) - perf_sched_cb_dec(event->pmu); - if (!event->attr.sample_period) - /* Only counting needs to read counter */ - paicrypt_stop(event, PERF_EF_UPDATE); + paicrypt_stop(event, PERF_EF_UPDATE); if (--cpump->active_events == 0) { local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); WRITE_ONCE(S390_lowcore.ccd, 0); diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index 9452205e6febc..275d4fd3fe38b 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -321,11 +321,15 @@ static void paiext_start(struct perf_event *event, int flags) { u64 sum; - if (event->hw.last_tag) - return; - event->hw.last_tag = 1; - sum = paiext_getall(event); /* Get current value */ - local64_set(&event->hw.prev_count, sum); + if (!event->attr.sample_period) { /* Counting */ + if (!event->hw.last_tag) { + event->hw.last_tag = 1; + sum = paiext_getall(event); /* Get current value */ + local64_set(&event->hw.prev_count, sum); + } + } else { /* Sampling */ + perf_sched_cb_inc(event->pmu); + } } static int paiext_add(struct perf_event *event, int flags) @@ -342,21 +346,19 @@ static int paiext_add(struct perf_event *event, int flags) debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n", __func__, S390_lowcore.aicd, pcb->acc); } - if (flags & PERF_EF_START && !event->attr.sample_period) { - /* Only counting needs initial counter value */ + cpump->event = event; + if (flags & PERF_EF_START) paiext_start(event, PERF_EF_RELOAD); - } event->hw.state = 0; - if (event->attr.sample_period) { - cpump->event = event; - perf_sched_cb_inc(event->pmu); - } return 0; } static void paiext_stop(struct perf_event *event, int flags) { - paiext_read(event); + if (!event->attr.sample_period) /* Counting */ + paiext_read(event); + else /* Sampling */ + perf_sched_cb_dec(event->pmu); event->hw.state = PERF_HES_STOPPED; } @@ -366,12 +368,7 @@ static void paiext_del(struct perf_event *event, int flags) struct paiext_map *cpump = mp->mapptr; struct paiext_cb *pcb = cpump->paiext_cb; - if (event->attr.sample_period) - perf_sched_cb_dec(event->pmu); - if (!event->attr.sample_period) { - /* Only counting needs to read counter */ - paiext_stop(event, PERF_EF_UPDATE); - } + paiext_stop(event, PERF_EF_UPDATE); if (--cpump->active_events == 0) { /* Disable CPU instruction lookup for PAIE1 control block */ local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT); -- GitLab From 0dade41d16132ac11ac9e3ac0b0313d438037224 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 16 Nov 2023 10:43:14 +0100 Subject: [PATCH 0873/1290] s390/pai_ext: rework function paiext_copy argments Change the function paiext_copy() parameter from a pointer to a structure to two pointers to memory areas referenced. The other members of that structure are not needed. Signed-off-by: Thomas Richter Acked-by: Mete Durlu Signed-off-by: Alexander Gordeev --- arch/s390/kernel/perf_pai_ext.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index 275d4fd3fe38b..e2e1ce2de695f 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -384,13 +384,12 @@ static void paiext_del(struct perf_event *event, int flags) * 2 bytes: Number of counter * 8 bytes: Value of counter */ -static size_t paiext_copy(struct paiext_map *cpump) +static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area) { - struct pai_userdata *userdata = cpump->save; int i, outidx = 0; for (i = 1; i <= paiext_cnt; i++) { - u64 val = paiext_getctr(cpump->area, i); + u64 val = paiext_getctr(area, i); if (val) { userdata[outidx].num = i; @@ -427,7 +426,7 @@ static int paiext_push_sample(void) size_t rawsize; int overflow; - rawsize = paiext_copy(cpump); + rawsize = paiext_copy(cpump->save, cpump->area); if (!rawsize) /* No incremented counters */ return 0; -- GitLab From 3046a1091137f55de473262b7f3a04c45f87873a Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 16 Nov 2023 12:12:12 +0100 Subject: [PATCH 0874/1290] s390/pai_ext: split function paiext_push_sample Split function paiext_push_sample() into two parts. The first part determines the number of bytes to store as raw data in the perf sample record. This is now function paiext_have_sample(). The second part stores the raw data in the perf event's ring buffer. No functional change. Signed-off-by: Thomas Richter Reviewed-by: Mete Durlu Signed-off-by: Alexander Gordeev --- arch/s390/kernel/perf_pai_ext.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index e2e1ce2de695f..af7f2b538c8fd 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -415,21 +415,14 @@ static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area) * sched_task() callback. That callback is not active after paiext_del() * returns and has deleted the event on that CPU. */ -static int paiext_push_sample(void) +static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump, + struct perf_event *event) { - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - struct perf_event *event = cpump->event; struct perf_sample_data data; struct perf_raw_record raw; struct pt_regs regs; - size_t rawsize; int overflow; - rawsize = paiext_copy(cpump->save, cpump->area); - if (!rawsize) /* No incremented counters */ - return 0; - /* Setup perf sample */ memset(®s, 0, sizeof(regs)); memset(&raw, 0, sizeof(raw)); @@ -458,6 +451,23 @@ static int paiext_push_sample(void) return overflow; } +/* Check if there is data to be saved on schedule out of a task. */ +static int paiext_have_sample(void) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct perf_event *event = cpump->event; + size_t rawsize; + int rc = 0; + + if (!event) + return 0; + rawsize = paiext_copy(cpump->save, cpump->area); + if (rawsize) /* Incremented counters */ + rc = paiext_push_sample(rawsize, cpump, event); + return rc; +} + /* Called on schedule-in and schedule-out. No access to event structure, * but for sampling only event NNPA_ALL is allowed. */ @@ -467,7 +477,7 @@ static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched * results on schedule_out and if page was dirty, clear values. */ if (!sched_in) - paiext_push_sample(); + paiext_have_sample(); } /* Attribute definitions for pai extension1 interface. As with other CPU -- GitLab From 813c2f2925ee9c10dc4acd5aa7410cd3357e8da8 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 12 Jan 2024 15:27:49 +0200 Subject: [PATCH 0875/1290] ASoC: SOF: icp3-dtrace: Revert "Fix wrong kfree() usage" The offending patch introduces memory leak when there is no error, the memory allocated for the temporary storage is not freed up. As I have commented, the original code was correct and cleaner to follow but it was not obvious from the patch that it will introduce regression. Fixes: 8c91ca76f448 ("ASoC: SOF: icp3-dtrace: Fix wrong kfree() usage") Link: https://lore.kernel.org/all/aec61f67-6b4f-49e6-b458-c332983a0ad6@linux.intel.com/ Signed-off-by: Peter Ujfalusi Link: https://msgid.link/r/20240112132749.28970-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc3-dtrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/soc/sof/ipc3-dtrace.c b/sound/soc/sof/ipc3-dtrace.c index 93b189c2d2ee2..0dca139322f3d 100644 --- a/sound/soc/sof/ipc3-dtrace.c +++ b/sound/soc/sof/ipc3-dtrace.c @@ -137,7 +137,6 @@ static int trace_filter_parse(struct snd_sof_dev *sdev, char *string, dev_err(sdev->dev, "Parsing filter entry '%s' failed with %d\n", entry, entry_len); - kfree(*out); return -EINVAL; } } @@ -209,13 +208,13 @@ static ssize_t dfsentry_trace_filter_write(struct file *file, const char __user ret = ipc3_trace_update_filter(sdev, num_elems, elems); if (ret < 0) { dev_err(sdev->dev, "Filter update failed: %d\n", ret); - kfree(elems); goto error; } } ret = count; error: kfree(string); + kfree(elems); return ret; } -- GitLab From 301bda18ac735eaaad5823dbdd067b3b2728c780 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Fri, 12 Jan 2024 06:10:15 +0000 Subject: [PATCH 0876/1290] ASoC: audio-graph-card2: fix index check on graph_parse_node_multi_nm() commit d685aea5e0a8 ("ASoC: audio-graph-card2: fix off by one in graph_parse_node_multi_nm()") uses ">=" instead of ">" for index check, but it was wrong. The nm_idx will be increment at end of loop, thus, ">" is correct. while (1) { ... => if (*nm_idx > nm_max) break; ... (*nm_idx)++; } Without this patch, "Multi-Codec-1" sample on ${LINUX}/sound/soc/generic/audio-graph-card2-custom-sample.dtsi will be error. Signed-off-by: Kuninori Morimoto Link: https://msgid.link/r/87o7drdqux.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/generic/audio-graph-card2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/generic/audio-graph-card2.c b/sound/soc/generic/audio-graph-card2.c index 9c94677f681a1..62606e20be9a3 100644 --- a/sound/soc/generic/audio-graph-card2.c +++ b/sound/soc/generic/audio-graph-card2.c @@ -556,7 +556,7 @@ static int graph_parse_node_multi_nm(struct snd_soc_dai_link *dai_link, struct device_node *mcodec_port; int codec_idx; - if (*nm_idx >= nm_max) + if (*nm_idx > nm_max) break; mcpu_ep_n = of_get_next_child(mcpu_port, mcpu_ep_n); -- GitLab From 5266caaf5660529e3da53004b8b7174cab6374ed Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 12 Jan 2024 20:26:26 +0800 Subject: [PATCH 0877/1290] blk-mq: fix IO hang from sbitmap wakeup race In blk_mq_mark_tag_wait(), __add_wait_queue() may be re-ordered with the following blk_mq_get_driver_tag() in case of getting driver tag failure. Then in __sbitmap_queue_wake_up(), waitqueue_active() may not observe the added waiter in blk_mq_mark_tag_wait() and wake up nothing, meantime blk_mq_mark_tag_wait() can't get driver tag successfully. This issue can be reproduced by running the following test in loop, and fio hang can be observed in < 30min when running it on my test VM in laptop. modprobe -r scsi_debug modprobe scsi_debug delay=0 dev_size_mb=4096 max_queue=1 host_max_queue=1 submit_queues=4 dev=`ls -d /sys/bus/pseudo/drivers/scsi_debug/adapter*/host*/target*/*/block/* | head -1 | xargs basename` fio --filename=/dev/"$dev" --direct=1 --rw=randrw --bs=4k --iodepth=1 \ --runtime=100 --numjobs=40 --time_based --name=test \ --ioengine=libaio Fix the issue by adding one explicit barrier in blk_mq_mark_tag_wait(), which is just fine in case of running out of tag. Cc: Jan Kara Cc: Kemeng Shi Reported-by: Changhui Zhong Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240112122626.4181044-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index aa9a05fdd0237..a4c54c5895a13 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1852,6 +1852,22 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, wait->flags &= ~WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq, wait); + /* + * Add one explicit barrier since blk_mq_get_driver_tag() may + * not imply barrier in case of failure. + * + * Order adding us to wait queue and allocating driver tag. + * + * The pair is the one implied in sbitmap_queue_wake_up() which + * orders clearing sbitmap tag bits and waitqueue_active() in + * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless + * + * Otherwise, re-order of adding wait queue and getting driver tag + * may cause __sbitmap_queue_wake_up() to wake up nothing because + * the waitqueue_active() may not observe us in wait queue. + */ + smp_mb(); + /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait -- GitLab From 25c1772a0493463408489b1fae65cf77fe46cac1 Mon Sep 17 00:00:00 2001 From: Christian Heusel Date: Fri, 12 Jan 2024 00:15:18 +0100 Subject: [PATCH 0878/1290] block: print symbolic error name instead of error code Utilize the %pe print specifier to get the symbolic error name as a string (i.e "-ENOMEM") in the log message instead of the error code to increase its readablility. This change was suggested in https://lore.kernel.org/all/92972476-0b1f-4d0a-9951-af3fc8bc6e65@suswa.mountain/ Signed-off-by: Christian Heusel Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240111231521.1596838-1-christian@heusel.eu Signed-off-by: Jens Axboe --- block/partitions/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index e6ac73617f3e1..cab0d76a828e3 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -562,8 +562,8 @@ static bool blk_add_partition(struct gendisk *disk, part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { - printk(KERN_ERR " %s: p%d could not be added: %ld\n", - disk->disk_name, p, -PTR_ERR(part)); + printk(KERN_ERR " %s: p%d could not be added: %pe\n", + disk->disk_name, p, part); return true; } -- GitLab From 309ce6741430b5a7b5e85cd1a7569647f8d9dfa6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jan 2024 14:57:04 +0100 Subject: [PATCH 0879/1290] blk-mq: rename blk_mq_can_use_cached_rq blk_mq_can_use_cached_rq doesn't just check if we can use the request, but also performs the work to actually use it. Remove the _can in the naming, and improve the comment describing the function. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240111135705.2155518-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a4c54c5895a13..f57b86d6de6ac 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2900,8 +2900,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, return NULL; } -/* return true if this @rq can be used for @bio */ -static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, +/* + * Check if we can use the passed on request for submitting the passed in bio, + * and remove it from the request list if it can be used. + */ +static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); @@ -2979,7 +2982,7 @@ void blk_mq_submit_bio(struct bio *bio) return; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) return; - if (blk_mq_can_use_cached_rq(rq, plug, bio)) + if (blk_mq_use_cached_rq(rq, plug, bio)) goto done; percpu_ref_get(&q->q_usage_counter); } else { -- GitLab From 454abb80e26ab85323a30e52aa7b0ee9aae1d38a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Fri, 12 Jan 2024 12:33:49 +0100 Subject: [PATCH 0880/1290] ALSA: hda: Properly setup HDMI stream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 4005d1ba0a7e ("ASoC: soc-dai: don't call PCM audio ops if the stream is not supported") HDMI playback is broken with avs driver. This happens because for HDMI stream (unlike generic HDA one) channels_min for stream is not set when creating PCMs. Fix this by setting the value based on first available converter. Fixes: 4005d1ba0a7e ("ASoC: soc-dai: don't call PCM audio ops if the stream is not supported") Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20240112113349.2905328-1-amadeuszx.slawinski@linux.intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_hdmi.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 200779296a1b8..495d63101186f 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -2301,6 +2301,7 @@ static int generic_hdmi_build_pcms(struct hda_codec *codec) codec_dbg(codec, "hdmi: pcm_num set to %d\n", pcm_num); for (idx = 0; idx < pcm_num; idx++) { + struct hdmi_spec_per_cvt *per_cvt; struct hda_pcm *info; struct hda_pcm_stream *pstr; @@ -2316,6 +2317,11 @@ static int generic_hdmi_build_pcms(struct hda_codec *codec) pstr = &info->stream[SNDRV_PCM_STREAM_PLAYBACK]; pstr->substreams = 1; pstr->ops = generic_ops; + + per_cvt = get_cvt(spec, 0); + pstr->channels_min = per_cvt->channels_min; + pstr->channels_max = per_cvt->channels_max; + /* pcm number is less than pcm_rec array size */ if (spec->pcm_used >= ARRAY_SIZE(spec->pcm_rec)) break; -- GitLab From 678164a5f0ecd6e0134ce246d64ae7345f8de59c Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 22 Dec 2023 13:13:11 +0000 Subject: [PATCH 0881/1290] pwm: bcm2835: Remove duplicate call to clk_rate_exclusive_put() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit devm_add_action_or_reset() already calls the action in the error case. Reported-by: Uwe Kleine-König Closes: https://lore.kernel.org/linux-pwm/fuku3b5ur6y4k4refd3vmeoenzjo6mwe3b3gtel34rhhhtvnsa@w4uktgbqsc3w/ Fixes: fcc760729359 ("pwm: bcm2835: Allow PWM driver to be used in atomic context") Signed-off-by: Sean Young Reviewed-by: Uwe Kleine-König Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20231222131312.174491-1-sean@mess.org Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-bcm2835.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-bcm2835.c b/drivers/pwm/pwm-bcm2835.c index 307c0bd5f8855..283cf27f25bae 100644 --- a/drivers/pwm/pwm-bcm2835.c +++ b/drivers/pwm/pwm-bcm2835.c @@ -160,10 +160,8 @@ static int bcm2835_pwm_probe(struct platform_device *pdev) ret = devm_add_action_or_reset(&pdev->dev, devm_clk_rate_exclusive_put, pc->clk); - if (ret) { - clk_rate_exclusive_put(pc->clk); + if (ret) return ret; - } pc->rate = clk_get_rate(pc->clk); if (!pc->rate) -- GitLab From a297d07b9a1e4fb8cda25a4a2363a507d294b7c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 9 Jan 2024 22:34:31 +0100 Subject: [PATCH 0882/1290] pwm: Fix out-of-bounds access in of_pwm_single_xlate() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With args->args_count == 2 args->args[2] is not defined. Actually the flags are contained in args->args[1]. Fixes: 3ab7b6ac5d82 ("pwm: Introduce single-PWM of_xlate function") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/243908750d306e018a3d4bf2eb745d53ab50f663.1704835845.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/pwm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 9a4c720c88aad..f2728ee787d7a 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -152,7 +152,7 @@ of_pwm_single_xlate(struct pwm_chip *chip, const struct of_phandle_args *args) pwm->args.period = args->args[0]; pwm->args.polarity = PWM_POLARITY_NORMAL; - if (args->args_count == 2 && args->args[2] & PWM_POLARITY_INVERTED) + if (args->args_count == 2 && args->args[1] & PWM_POLARITY_INVERTED) pwm->args.polarity = PWM_POLARITY_INVERSED; return pwm; -- GitLab From 9320fc509b87b4d795fb37112931e2f4f8b5c55f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 6 Jan 2024 15:13:03 +0100 Subject: [PATCH 0883/1290] pwm: jz4740: Don't use dev_err_probe() in .request() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dev_err_probe() is only supposed to be used in probe functions. While it probably doesn't hurt, both the EPROBE_DEFER handling and calling device_set_deferred_probe_reason() are conceptually wrong in the request callback. So replace the call by dev_err() and a separate return statement. This effectively reverts commit c0bfe9606e03 ("pwm: jz4740: Simplify with dev_err_probe()"). Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240106141302.1253365-2-u.kleine-koenig@pengutronix.de Fixes: c0bfe9606e03 ("pwm: jz4740: Simplify with dev_err_probe()") Cc: stable@vger.kernel.org Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-jz4740.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-jz4740.c b/drivers/pwm/pwm-jz4740.c index 80dcff237a155..3933418e551b4 100644 --- a/drivers/pwm/pwm-jz4740.c +++ b/drivers/pwm/pwm-jz4740.c @@ -61,9 +61,10 @@ static int jz4740_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) snprintf(name, sizeof(name), "timer%u", pwm->hwpwm); clk = clk_get(chip->dev, name); - if (IS_ERR(clk)) - return dev_err_probe(chip->dev, PTR_ERR(clk), - "Failed to get clock\n"); + if (IS_ERR(clk)) { + dev_err(chip->dev, "error %pe: Failed to get clock\n", clk); + return PTR_ERR(clk); + } err = clk_prepare_enable(clk); if (err < 0) { -- GitLab From 73bf93edeeea866b0b6efbc8d2595bdaaba7f1a5 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 12 Jan 2024 14:27:09 +0800 Subject: [PATCH 0884/1290] cxl/core: use sysfs_emit() for attr's _show() sprintf() is deprecated for sysfs, use preferred sysfs_emit() instead. Signed-off-by: Shiyang Ruan Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Fan Ni Link: https://lore.kernel.org/r/20240112062709.2490947-1-ruansy.fnst@fujitsu.com Signed-off-by: Dan Williams --- drivers/cxl/core/memdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 2f43d368ba073..dae8802ecdb01 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -114,7 +114,7 @@ static DEVICE_ATTR_RO(serial); static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_to_node(dev)); + return sysfs_emit(buf, "%d\n", dev_to_node(dev)); } static DEVICE_ATTR_RO(numa_node); -- GitLab From cbdd50ec8b1daef6d71fb3325e436c8dec2d2e15 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 11 Jan 2024 19:24:29 +0300 Subject: [PATCH 0885/1290] net: liquidio: fix clang-specific W=1 build warnings When compiling with clang-18 and W=1, I've noticed the following warnings: drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c:1493:16: warning: cast from 'void (*)(struct octeon_device *, struct octeon_mbox_cmd *, void *)' to 'octeon_mbox_callback_t' (aka 'void (*)(void *, void *, void *)') converts to incompatible function type [-Wcast-function-type-strict] 1493 | mbox_cmd.fn = (octeon_mbox_callback_t)cn23xx_get_vf_stats_callback; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ and: drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c:432:16: warning: cast from 'void (*)(struct octeon_device *, struct octeon_mbox_cmd *, void *)' to 'octeon_mbox_callback_t' (aka 'void (*)(void *, void *, void *)') converts to incompatible function type [-Wcast-function-type-strict] 432 | mbox_cmd.fn = (octeon_mbox_callback_t)octeon_pfvf_hs_callback; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix both of the above by adjusting 'octeon_mbox_callback_t' to match actual callback definitions (at the cost of adding an extra forward declaration). Signed-off-by: Dmitry Antipov Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240111162432.124014-1-dmantipov@yandex.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c | 2 +- drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c | 2 +- drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c index 068ed52b66c94..b3c81a2e9d464 100644 --- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c +++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c @@ -1490,7 +1490,7 @@ int cn23xx_get_vf_stats(struct octeon_device *oct, int vfidx, mbox_cmd.q_no = vfidx * oct->sriov_info.rings_per_vf; mbox_cmd.recv_len = 0; mbox_cmd.recv_status = 0; - mbox_cmd.fn = (octeon_mbox_callback_t)cn23xx_get_vf_stats_callback; + mbox_cmd.fn = cn23xx_get_vf_stats_callback; ctx.stats = stats; atomic_set(&ctx.status, 0); mbox_cmd.fn_arg = (void *)&ctx; diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c b/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c index dd5d80fee24f0..d2fcb3da484e3 100644 --- a/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c +++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c @@ -429,7 +429,7 @@ int cn23xx_octeon_pfvf_handshake(struct octeon_device *oct) mbox_cmd.q_no = 0; mbox_cmd.recv_len = 0; mbox_cmd.recv_status = 0; - mbox_cmd.fn = (octeon_mbox_callback_t)octeon_pfvf_hs_callback; + mbox_cmd.fn = octeon_pfvf_hs_callback; mbox_cmd.fn_arg = &status; octeon_mbox_write(oct, &mbox_cmd); diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h b/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h index d92bd7e164775..9ac85d22c6157 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h @@ -57,7 +57,10 @@ union octeon_mbox_message { } s; }; -typedef void (*octeon_mbox_callback_t)(void *, void *, void *); +struct octeon_mbox_cmd; + +typedef void (*octeon_mbox_callback_t)(struct octeon_device *, + struct octeon_mbox_cmd *, void *); struct octeon_mbox_cmd { union octeon_mbox_message msg; -- GitLab From 89e23277f9c16df6f9f9c1a1a07f8f132339c15c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2024 19:49:13 +0000 Subject: [PATCH 0886/1290] mptcp: mptcp_parse_option() fix for MPTCPOPT_MP_JOIN mptcp_parse_option() currently sets OPTIONS_MPTCP_MPJ, for the three possible cases handled for MPTCPOPT_MP_JOIN option. OPTIONS_MPTCP_MPJ is the combination of three flags: - OPTION_MPTCP_MPJ_SYN - OPTION_MPTCP_MPJ_SYNACK - OPTION_MPTCP_MPJ_ACK This is a problem, because backup, join_id, token, nonce and/or hmac fields could be left uninitialized in some cases. Distinguish the three cases, as following patches will need this step. Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Peter Krystad Cc: Matthieu Baerts Cc: Mat Martineau Cc: Geliang Tang Reviewed-by: Simon Horman Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/20240111194917.4044654-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index c53914012d01d..d2527d189a799 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -123,8 +123,8 @@ static void mptcp_parse_option(const struct sk_buff *skb, break; case MPTCPOPT_MP_JOIN: - mp_opt->suboptions |= OPTIONS_MPTCP_MPJ; if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { + mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYN; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->token = get_unaligned_be32(ptr); @@ -135,6 +135,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->backup, mp_opt->join_id, mp_opt->token, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { + mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYNACK; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->thmac = get_unaligned_be64(ptr); @@ -145,11 +146,10 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->backup, mp_opt->join_id, mp_opt->thmac, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { + mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK; ptr += 2; memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); pr_debug("MP_JOIN hmac"); - } else { - mp_opt->suboptions &= ~OPTIONS_MPTCP_MPJ; } break; -- GitLab From c1665273bdc7c201766c65e561c06711f2e050dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2024 19:49:14 +0000 Subject: [PATCH 0887/1290] mptcp: strict validation before using mp_opt->hmac mp_opt->hmac contains uninitialized data unless OPTION_MPTCP_MPJ_ACK was set in mptcp_parse_option(). We must refine the condition before we call subflow_hmac_valid(). Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Peter Krystad Cc: Matthieu Baerts Cc: Mat Martineau Cc: Geliang Tang Reviewed-by: Simon Horman Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/20240111194917.4044654-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3eacd04e7099e..bb05477006a6e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -788,7 +788,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); - if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) || + if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) || !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); -- GitLab From be1d9d9d38da922bd4beeec5b6dd821ff5a1dfeb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2024 19:49:15 +0000 Subject: [PATCH 0888/1290] mptcp: use OPTION_MPTCP_MPJ_SYNACK in subflow_finish_connect() subflow_finish_connect() uses four fields (backup, join_id, thmac, none) that may contain garbage unless OPTION_MPTCP_MPJ_SYNACK has been set in mptcp_parse_option() Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Peter Krystad Cc: Matthieu Baerts Cc: Mat Martineau Cc: Geliang Tang Reviewed-by: Simon Horman Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/20240111194917.4044654-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bb05477006a6e..103ff5471993a 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -506,7 +506,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; - if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ)) { + if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYNACK)) { subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } -- GitLab From 66ff70df1a919a066942844bb095d6fcb748d78d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2024 19:49:16 +0000 Subject: [PATCH 0889/1290] mptcp: use OPTION_MPTCP_MPJ_SYN in subflow_check_req() syzbot reported that subflow_check_req() was using uninitialized data in subflow_check_req() [1] This is because mp_opt.token is only set when OPTION_MPTCP_MPJ_SYN is also set. While we are are it, fix mptcp_subflow_init_cookie_req() to test for OPTION_MPTCP_MPJ_ACK. [1] BUG: KMSAN: uninit-value in subflow_token_join_request net/mptcp/subflow.c:91 [inline] BUG: KMSAN: uninit-value in subflow_check_req+0x1028/0x15d0 net/mptcp/subflow.c:209 subflow_token_join_request net/mptcp/subflow.c:91 [inline] subflow_check_req+0x1028/0x15d0 net/mptcp/subflow.c:209 subflow_v6_route_req+0x269/0x410 net/mptcp/subflow.c:367 tcp_conn_request+0x153a/0x4240 net/ipv4/tcp_input.c:7164 subflow_v6_conn_request+0x3ee/0x510 tcp_rcv_state_process+0x2e1/0x4ac0 net/ipv4/tcp_input.c:6659 tcp_v6_do_rcv+0x11bf/0x1fe0 net/ipv6/tcp_ipv6.c:1669 tcp_v6_rcv+0x480b/0x4fb0 net/ipv6/tcp_ipv6.c:1900 ip6_protocol_deliver_rcu+0xda6/0x2a60 net/ipv6/ip6_input.c:438 ip6_input_finish net/ipv6/ip6_input.c:483 [inline] NF_HOOK include/linux/netfilter.h:314 [inline] ip6_input+0x15d/0x430 net/ipv6/ip6_input.c:492 dst_input include/net/dst.h:461 [inline] ip6_rcv_finish+0x5db/0x870 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:314 [inline] ipv6_rcv+0xda/0x390 net/ipv6/ip6_input.c:310 __netif_receive_skb_one_core net/core/dev.c:5532 [inline] __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5646 netif_receive_skb_internal net/core/dev.c:5732 [inline] netif_receive_skb+0x58/0x660 net/core/dev.c:5791 tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1555 tun_get_user+0x53af/0x66d0 drivers/net/tun.c:2002 tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2020 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x8ef/0x1490 fs/read_write.c:584 ksys_write+0x20f/0x4c0 fs/read_write.c:637 __do_sys_write fs/read_write.c:649 [inline] __se_sys_write fs/read_write.c:646 [inline] __x64_sys_write+0x93/0xd0 fs/read_write.c:646 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Local variable mp_opt created at: subflow_check_req+0x6d/0x15d0 net/mptcp/subflow.c:145 subflow_v6_route_req+0x269/0x410 net/mptcp/subflow.c:367 CPU: 1 PID: 5924 Comm: syz-executor.3 Not tainted 6.7.0-rc8-syzkaller-00055-g5eff55d725a4 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Peter Krystad Cc: Matthieu Baerts Cc: Mat Martineau Cc: Geliang Tang Reviewed-by: Simon Horman Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/20240111194917.4044654-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 103ff5471993a..13039ac8d1ab6 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -158,7 +158,7 @@ static int subflow_check_req(struct request_sock *req, mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); - opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); + opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN); if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); @@ -255,7 +255,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); - opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); + opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK); if (opt_mp_capable && opt_mp_join) return -EINVAL; -- GitLab From 724b00c12957973656d312dce2a110c75ae2c680 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2024 19:49:17 +0000 Subject: [PATCH 0890/1290] mptcp: refine opt_mp_capable determination OPTIONS_MPTCP_MPC is a combination of three flags. It would be better to be strict about testing what flag is expected, at least for code readability. mptcp_parse_option() already makes the distinction. - subflow_check_req() should use OPTION_MPTCP_MPC_SYN. - mptcp_subflow_init_cookie_req() should use OPTION_MPTCP_MPC_ACK. - subflow_finish_connect() should use OPTION_MPTCP_MPC_SYNACK - subflow_syn_recv_sock should use OPTION_MPTCP_MPC_ACK Suggested-by: Paolo Abeni Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Fixes: 74c7dfbee3e1 ("mptcp: consolidate in_opt sub-options fields in a bitmask") Link: https://lore.kernel.org/r/20240111194917.4044654-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 13039ac8d1ab6..1117d1e84274a 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -157,7 +157,7 @@ static int subflow_check_req(struct request_sock *req, mptcp_get_options(skb, &mp_opt); - opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); + opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYN); opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN); if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); @@ -254,7 +254,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, subflow_init_req(req, sk_listener); mptcp_get_options(skb, &mp_opt); - opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); + opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK); opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK); if (opt_mp_capable && opt_mp_join) return -EINVAL; @@ -486,7 +486,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp) { - if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { + if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); mptcp_do_fallback(sk); @@ -783,7 +783,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * options. */ mptcp_get_options(skb, &mp_opt); - if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) + if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK)) fallback = true; } else if (subflow_req->mp_join) { -- GitLab From 7b4f36cd22a65b750b4cb6ac14804fb7d6e6c67d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Jan 2024 09:12:20 -0700 Subject: [PATCH 0891/1290] block: ensure we hold a queue reference when using queue limits q_usage_counter is the only thing preventing us from the limits changing under us in __bio_split_to_limits, but blk_mq_submit_bio doesn't hold it while calling into it. Move the splitting inside the region where we know we've got a queue reference. Ideally this could still remain a shared section of code, but let's keep the fix simple and defer any refactoring here to later. Reported-by: Christoph Hellwig Fixes: 900e08075202 ("block: move queue enter logic into blk_mq_submit_bio()") Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index f57b86d6de6ac..e02c4b1af8c55 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2964,12 +2964,6 @@ void blk_mq_submit_bio(struct bio *bio) blk_status_t ret; bio = blk_queue_bounce(bio, q); - if (bio_may_exceed_limits(bio, &q->limits)) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - return; - } - bio_set_ioprio(bio); if (plug) { @@ -2978,6 +2972,11 @@ void blk_mq_submit_bio(struct bio *bio) rq = NULL; } if (rq) { + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + return; + } if (!bio_integrity_prep(bio)) return; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) @@ -2988,6 +2987,11 @@ void blk_mq_submit_bio(struct bio *bio) } else { if (unlikely(bio_queue_enter(bio))) return; + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto fail; + } if (!bio_integrity_prep(bio)) goto fail; } -- GitLab From 08300adac3b8dab9e2fd3be0155c7d3093c755f4 Mon Sep 17 00:00:00 2001 From: Sneh Shah Date: Tue, 9 Jan 2024 20:17:29 +0530 Subject: [PATCH 0892/1290] net: stmmac: Fix ethool link settings ops for integrated PCS Currently get/set_link_ksettings ethtool ops are dependent on PCS. When PCS is integrated, it will not have separate link config. Bypass configuring and checking PCS for integrated PCS. Fixes: aa571b6275fb ("net: stmmac: add new switch to struct plat_stmmacenet_data") Tested-by: Andrew Halaney # sa8775p-ride Signed-off-by: Sneh Shah Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index dd05437b51f91..0f8bfba4443d1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -321,8 +321,9 @@ static int stmmac_ethtool_get_link_ksettings(struct net_device *dev, { struct stmmac_priv *priv = netdev_priv(dev); - if (priv->hw->pcs & STMMAC_PCS_RGMII || - priv->hw->pcs & STMMAC_PCS_SGMII) { + if (!(priv->plat->flags & STMMAC_FLAG_HAS_INTEGRATED_PCS) && + (priv->hw->pcs & STMMAC_PCS_RGMII || + priv->hw->pcs & STMMAC_PCS_SGMII)) { struct rgmii_adv adv; u32 supported, advertising, lp_advertising; @@ -407,8 +408,9 @@ stmmac_ethtool_set_link_ksettings(struct net_device *dev, { struct stmmac_priv *priv = netdev_priv(dev); - if (priv->hw->pcs & STMMAC_PCS_RGMII || - priv->hw->pcs & STMMAC_PCS_SGMII) { + if (!(priv->plat->flags & STMMAC_FLAG_HAS_INTEGRATED_PCS) && + (priv->hw->pcs & STMMAC_PCS_RGMII || + priv->hw->pcs & STMMAC_PCS_SGMII)) { /* Only support ANE */ if (cmd->base.autoneg != AUTONEG_ENABLE) return -EINVAL; -- GitLab From 482521d8e0c6520429478aa6866cd44128b33d5d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jan 2024 10:44:27 +0000 Subject: [PATCH 0893/1290] udp: annotate data-races around up->pending up->pending can be read without holding the socket lock, as pointed out by syzbot [1] Add READ_ONCE() in lockless contexts, and WRITE_ONCE() on write side. [1] BUG: KCSAN: data-race in udpv6_sendmsg / udpv6_sendmsg write to 0xffff88814e5eadf0 of 4 bytes by task 15547 on cpu 1: udpv6_sendmsg+0x1405/0x1530 net/ipv6/udp.c:1596 inet6_sendmsg+0x63/0x80 net/ipv6/af_inet6.c:657 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] __sys_sendto+0x257/0x310 net/socket.c:2192 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __x64_sys_sendto+0x78/0x90 net/socket.c:2200 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b read to 0xffff88814e5eadf0 of 4 bytes by task 15551 on cpu 0: udpv6_sendmsg+0x22c/0x1530 net/ipv6/udp.c:1373 inet6_sendmsg+0x63/0x80 net/ipv6/af_inet6.c:657 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x37c/0x4d0 net/socket.c:2586 ___sys_sendmsg net/socket.c:2640 [inline] __sys_sendmmsg+0x269/0x500 net/socket.c:2726 __do_sys_sendmmsg net/socket.c:2755 [inline] __se_sys_sendmmsg net/socket.c:2752 [inline] __x64_sys_sendmmsg+0x57/0x60 net/socket.c:2752 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b value changed: 0x00000000 -> 0x0000000a Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 15551 Comm: syz-executor.1 Tainted: G W 6.7.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+8d482d0e407f665d9d10@syzkaller.appspotmail.com Link: https://lore.kernel.org/netdev/0000000000009e46c3060ebcdffd@google.com/ Signed-off-by: Eric Dumazet Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/udp.c | 12 ++++++------ net/ipv6/udp.c | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 89e5a806b82e9..5f742d0b9e079 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -805,7 +805,7 @@ void udp_flush_pending_frames(struct sock *sk) if (up->pending) { up->len = 0; - up->pending = 0; + WRITE_ONCE(up->pending, 0); ip_flush_pending_frames(sk); } } @@ -993,7 +993,7 @@ int udp_push_pending_frames(struct sock *sk) out: up->len = 0; - up->pending = 0; + WRITE_ONCE(up->pending, 0); return err; } EXPORT_SYMBOL(udp_push_pending_frames); @@ -1070,7 +1070,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; fl4 = &inet->cork.fl.u.ip4; - if (up->pending) { + if (READ_ONCE(up->pending)) { /* * There are pending frames. * The socket lock must be held while it's corked. @@ -1269,7 +1269,7 @@ back_from_confirm: fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = inet->inet_sport; - up->pending = AF_INET; + WRITE_ONCE(up->pending, AF_INET); do_append_data: up->len += ulen; @@ -1281,7 +1281,7 @@ do_append_data: else if (!corkreq) err = udp_push_pending_frames(sk); else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) - up->pending = 0; + WRITE_ONCE(up->pending, 0); release_sock(sk); out: @@ -1319,7 +1319,7 @@ void udp_splice_eof(struct socket *sock) struct sock *sk = sock->sk; struct udp_sock *up = udp_sk(sk); - if (!up->pending || udp_test_bit(CORK, sk)) + if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk)) return; lock_sock(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 594e3f23c1290..3f2249b4cd5f6 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1135,7 +1135,7 @@ static void udp_v6_flush_pending_frames(struct sock *sk) udp_flush_pending_frames(sk); else if (up->pending) { up->len = 0; - up->pending = 0; + WRITE_ONCE(up->pending, 0); ip6_flush_pending_frames(sk); } } @@ -1313,7 +1313,7 @@ static int udp_v6_push_pending_frames(struct sock *sk) &inet_sk(sk)->cork.base); out: up->len = 0; - up->pending = 0; + WRITE_ONCE(up->pending, 0); return err; } @@ -1370,7 +1370,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) default: return -EINVAL; } - } else if (!up->pending) { + } else if (!READ_ONCE(up->pending)) { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; @@ -1401,8 +1401,8 @@ do_udp_sendmsg: return -EMSGSIZE; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; - if (up->pending) { - if (up->pending == AF_INET) + if (READ_ONCE(up->pending)) { + if (READ_ONCE(up->pending) == AF_INET) return udp_sendmsg(sk, msg, len); /* * There are pending frames. @@ -1593,7 +1593,7 @@ back_from_confirm: goto out; } - up->pending = AF_INET6; + WRITE_ONCE(up->pending, AF_INET6); do_append_data: if (ipc6.dontfrag < 0) @@ -1607,7 +1607,7 @@ do_append_data: else if (!corkreq) err = udp_v6_push_pending_frames(sk); else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) - up->pending = 0; + WRITE_ONCE(up->pending, 0); if (err > 0) err = inet6_test_bit(RECVERR6, sk) ? net_xmit_errno(err) : 0; @@ -1648,7 +1648,7 @@ static void udpv6_splice_eof(struct socket *sock) struct sock *sk = sock->sk; struct udp_sock *up = udp_sk(sk); - if (!up->pending || udp_test_bit(CORK, sk)) + if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk)) return; lock_sock(sk); -- GitLab From e18405d0be8001fa4c5f9e61471f6ffd59c7a1b3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 12 Jan 2024 12:39:30 +0100 Subject: [PATCH 0894/1290] net: sched: track device in tcf_block_get/put_ext() only for clsact binder types Clsact/ingress qdisc is not the only one using shared block, red is also using it. The device tracking was originally introduced by commit 913b47d3424e ("net/sched: Introduce tc block netdev tracking infra") for clsact/ingress only. Commit 94e2557d086a ("net: sched: move block device tracking into tcf_block_get/put_ext()") mistakenly enabled that for red as well. Fix that by adding a check for the binder type being clsact when adding device to the block->ports xarray. Reported-by: Ido Schimmel Closes: https://lore.kernel.org/all/ZZ6JE0odnu1lLPtu@shredder/ Fixes: 94e2557d086a ("net: sched: move block device tracking into tcf_block_get/put_ext()") Signed-off-by: Jiri Pirko Tested-by: Ido Schimmel Acked-by: Jamal Hadi Salim Tested-by: Victor Nogueira Signed-off-by: David S. Miller --- net/sched/cls_api.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e3236a3169c32..92a12e3d0fe63 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1424,6 +1424,14 @@ static void tcf_block_owner_del(struct tcf_block *block, WARN_ON(1); } +static bool tcf_block_tracks_dev(struct tcf_block *block, + struct tcf_block_ext_info *ei) +{ + return tcf_block_shared(block) && + (ei->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS || + ei->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS); +} + int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) @@ -1462,7 +1470,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, if (err) goto err_block_offload_bind; - if (tcf_block_shared(block)) { + if (tcf_block_tracks_dev(block, ei)) { err = xa_insert(&block->ports, dev->ifindex, dev, GFP_KERNEL); if (err) { NL_SET_ERR_MSG(extack, "block dev insert failed"); @@ -1516,7 +1524,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, if (!block) return; - if (tcf_block_shared(block)) + if (tcf_block_tracks_dev(block, ei)) xa_erase(&block->ports, dev->ifindex); tcf_chain0_head_change_cb_del(block, ei); tcf_block_owner_del(block, q, ei->binder_type); -- GitLab From 118a8cf504d7dfa519562d000f423ee3ca75d2c4 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 13 Jan 2024 23:06:02 +0800 Subject: [PATCH 0895/1290] erofs: fix inconsistent per-file compression format EROFS can select compression algorithms on a per-file basis, and each per-file compression algorithm needs to be marked in the on-disk superblock for initialization. However, syzkaller can generate inconsistent crafted images that use an unsupported algorithmtype for specific inodes, e.g. use MicroLZMA algorithmtype even it's not set in `sbi->available_compr_algs`. This can lead to an unexpected "BUG: kernel NULL pointer dereference" if the corresponding decompressor isn't built-in. Fix this by checking against `sbi->available_compr_algs` for each m_algorithmformat request. Incorrect !erofs_sb_has_compr_cfgs preset bitmap is now fixed together since it was harmless previously. Reported-by: Fixes: 8f89926290c4 ("erofs: get compression algorithms directly on mapping") Fixes: 622ceaddb764 ("erofs: lzma compression support") Reviewed-by: Yue Hu Link: https://lore.kernel.org/r/20240113150602.1471050-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 2 +- fs/erofs/zmap.c | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 021be5feb1bcf..af98e88908eea 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -398,7 +398,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) int size, ret = 0; if (!erofs_sb_has_compr_cfgs(sbi)) { - sbi->available_compr_algs = Z_EROFS_COMPRESSION_LZ4; + sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4; return z_erofs_load_lz4_config(sb, dsb, NULL, 0); } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 7b55111fd5337..7a1a24ae4a2d8 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -458,7 +458,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, .map = map, }; int err = 0; - unsigned int lclusterbits, endoff; + unsigned int lclusterbits, endoff, afmt; unsigned long initial_lcn; unsigned long long ofs, end; @@ -547,17 +547,20 @@ static int z_erofs_do_map_blocks(struct inode *inode, err = -EFSCORRUPTED; goto unmap_out; } - if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) - map->m_algorithmformat = - Z_EROFS_COMPRESSION_INTERLACED; - else - map->m_algorithmformat = - Z_EROFS_COMPRESSION_SHIFTED; - } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) { - map->m_algorithmformat = vi->z_algorithmtype[1]; + afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ? + Z_EROFS_COMPRESSION_INTERLACED : + Z_EROFS_COMPRESSION_SHIFTED; } else { - map->m_algorithmformat = vi->z_algorithmtype[0]; + afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? + vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; + if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { + erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + afmt, vi->nid); + err = -EFSCORRUPTED; + goto unmap_out; + } } + map->m_algorithmformat = afmt; if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && -- GitLab From 9181d6f8a2bb32d158de66a84164fac05e3ddd18 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jan 2024 12:28:16 +0000 Subject: [PATCH 0896/1290] net: add more sanity check in virtio_net_hdr_to_skb() syzbot/KMSAN reports access to uninitialized data from gso_features_check() [1] The repro use af_packet, injecting a gso packet and hdrlen == 0. We could fix the issue making gso_features_check() more careful while dealing with NETIF_F_TSO_MANGLEID in fast path. Or we can make sure virtio_net_hdr_to_skb() pulls minimal network and transport headers as intended. Note that for GSO packets coming from untrusted sources, SKB_GSO_DODGY bit forces a proper header validation (and pull) before the packet can hit any device ndo_start_xmit(), thus we do not need a precise disection at virtio_net_hdr_to_skb() stage. [1] BUG: KMSAN: uninit-value in skb_gso_segment include/net/gso.h:83 [inline] BUG: KMSAN: uninit-value in validate_xmit_skb+0x10f2/0x1930 net/core/dev.c:3629 skb_gso_segment include/net/gso.h:83 [inline] validate_xmit_skb+0x10f2/0x1930 net/core/dev.c:3629 __dev_queue_xmit+0x1eac/0x5130 net/core/dev.c:4341 dev_queue_xmit include/linux/netdevice.h:3134 [inline] packet_xmit+0x9c/0x6b0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3087 [inline] packet_sendmsg+0x8b1d/0x9f30 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5e9/0xb10 mm/slub.c:3523 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:560 __alloc_skb+0x318/0x740 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6334 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2780 packet_alloc_skb net/packet/af_packet.c:2936 [inline] packet_snd net/packet/af_packet.c:3030 [inline] packet_sendmsg+0x70e8/0x9f30 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 0 PID: 5025 Comm: syz-executor279 Not tainted 6.7.0-rc7-syzkaller-00003-gfbafc3e621c3 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 Reported-by: syzbot+7f4d0ea3df4d4fa9a65f@syzkaller.appspotmail.com Link: https://lore.kernel.org/netdev/0000000000005abd7b060eb160cd@google.com/ Fixes: 9274124f023b ("net: stricter validation of untrusted gso packets") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 27cc1d4643219..4dfa9b69ca8d9 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -3,6 +3,8 @@ #define _LINUX_VIRTIO_NET_H #include +#include +#include #include #include #include @@ -49,6 +51,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) { + unsigned int nh_min_len = sizeof(struct iphdr); unsigned int gso_type = 0; unsigned int thlen = 0; unsigned int p_off = 0; @@ -65,6 +68,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, gso_type = SKB_GSO_TCPV6; ip_proto = IPPROTO_TCP; thlen = sizeof(struct tcphdr); + nh_min_len = sizeof(struct ipv6hdr); break; case VIRTIO_NET_HDR_GSO_UDP: gso_type = SKB_GSO_UDP; @@ -100,7 +104,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; - p_off = skb_transport_offset(skb) + thlen; + nh_min_len = max_t(u32, nh_min_len, skb_transport_offset(skb)); + p_off = nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } else { @@ -140,7 +145,7 @@ retry: skb_set_transport_header(skb, keys.control.thoff); } else if (gso_type) { - p_off = thlen; + p_off = nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } -- GitLab From 894d7508316e7ad722df597d68b4b1797a9eee11 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 12 Jan 2024 17:13:14 +0100 Subject: [PATCH 0897/1290] net: netdev_queue: netdev_txq_completed_mb(): fix wake condition netif_txq_try_stop() uses "get_desc >= start_thrs" as the check for the call to netif_tx_start_queue(). Use ">=" i netdev_txq_completed_mb(), too. Fixes: c91c46de6bbc ("net: provide macros for commonly copied lockless queue stop/wake code") Signed-off-by: Marc Kleine-Budde Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netdev_queues.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index d68b0a4834315..8b8ed4e13d74d 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -128,7 +128,7 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue, netdev_txq_completed_mb(txq, pkts, bytes); \ \ _res = -1; \ - if (pkts && likely(get_desc > start_thrs)) { \ + if (pkts && likely(get_desc >= start_thrs)) { \ _res = 1; \ if (unlikely(netif_tx_queue_stopped(txq)) && \ !(down_cond)) { \ -- GitLab From 19ca0823f6eaad01d18f664a00550abe912c034c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 12 Jan 2024 11:05:28 -0800 Subject: [PATCH 0898/1290] bpf: iter_udp: Retry with a larger batch size without going back to the previous bucket The current logic is to use a default size 16 to batch the whole bucket. If it is too small, it will retry with a larger batch size. The current code accidentally does a state->bucket-- before retrying. This goes back to retry with the previous bucket which has already been done. This patch fixed it. It is hard to create a selftest. I added a WARN_ON(state->bucket < 0), forced a particular port to be hashed to the first bucket, created >16 sockets, and observed the for-loop went back to the "-1" bucket. Cc: Aditi Ghag Fixes: c96dac8d369f ("bpf: udp: Implement batching for sockets iterator") Acked-by: Yonghong Song Signed-off-by: Martin KaFai Lau Reviewed-by: Aditi Ghag Link: https://lore.kernel.org/r/20240112190530.3751661-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- net/ipv4/udp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5f742d0b9e079..79050d83e7366 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3213,7 +3213,6 @@ again: /* After allocating a larger batch, retry one more time to grab * the whole bucket. */ - state->bucket--; goto again; } done: -- GitLab From 2242fd537fab52d5f4d2fbb1845f047c01fad0cf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 12 Jan 2024 11:05:29 -0800 Subject: [PATCH 0899/1290] bpf: Avoid iter->offset making backward progress in bpf_iter_udp There is a bug in the bpf_iter_udp_batch() function that stops the userspace from making forward progress. The case that triggers the bug is the userspace passed in a very small read buffer. When the bpf prog does bpf_seq_printf, the userspace read buffer is not enough to capture the whole bucket. When the read buffer is not large enough, the kernel will remember the offset of the bucket in iter->offset such that the next userspace read() can continue from where it left off. The kernel will skip the number (== "iter->offset") of sockets in the next read(). However, the code directly decrements the "--iter->offset". This is incorrect because the next read() may not consume the whole bucket either and then the next-next read() will start from offset 0. The net effect is the userspace will keep reading from the beginning of a bucket and the process will never finish. "iter->offset" must always go forward until the whole bucket is consumed. This patch fixes it by using a local variable "resume_offset" and "resume_bucket". "iter->offset" is always reset to 0 before it may be used. "iter->offset" will be advanced to the "resume_offset" when it continues from the "resume_bucket" (i.e. "state->bucket == resume_bucket"). This brings it closer to the bpf_iter_tcp's offset handling which does not suffer the same bug. Cc: Aditi Ghag Fixes: c96dac8d369f ("bpf: udp: Implement batching for sockets iterator") Acked-by: Yonghong Song Signed-off-by: Martin KaFai Lau Reviewed-by: Aditi Ghag Link: https://lore.kernel.org/r/20240112190530.3751661-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- net/ipv4/udp.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 79050d83e7366..148ffb007969f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3137,16 +3137,18 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq) struct bpf_udp_iter_state *iter = seq->private; struct udp_iter_state *state = &iter->state; struct net *net = seq_file_net(seq); + int resume_bucket, resume_offset; struct udp_table *udptable; unsigned int batch_sks = 0; bool resized = false; struct sock *sk; + resume_bucket = state->bucket; + resume_offset = iter->offset; + /* The current batch is done, so advance the bucket. */ - if (iter->st_bucket_done) { + if (iter->st_bucket_done) state->bucket++; - iter->offset = 0; - } udptable = udp_get_table_seq(seq, net); @@ -3166,19 +3168,19 @@ again: for (; state->bucket <= udptable->mask; state->bucket++) { struct udp_hslot *hslot2 = &udptable->hash2[state->bucket]; - if (hlist_empty(&hslot2->head)) { - iter->offset = 0; + if (hlist_empty(&hslot2->head)) continue; - } + iter->offset = 0; spin_lock_bh(&hslot2->lock); udp_portaddr_for_each_entry(sk, &hslot2->head) { if (seq_sk_match(seq, sk)) { /* Resume from the last iterated socket at the * offset in the bucket before iterator was stopped. */ - if (iter->offset) { - --iter->offset; + if (state->bucket == resume_bucket && + iter->offset < resume_offset) { + ++iter->offset; continue; } if (iter->end_sk < iter->max_sk) { @@ -3192,9 +3194,6 @@ again: if (iter->end_sk) break; - - /* Reset the current bucket's offset before moving to the next bucket. */ - iter->offset = 0; } /* All done: no batch made. */ -- GitLab From dbd7db7787ba1743a505a495e479550932836fa4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 12 Jan 2024 11:05:30 -0800 Subject: [PATCH 0900/1290] selftests/bpf: Test udp and tcp iter batching The patch adds a test to exercise the bpf_iter_udp batching logic. It specifically tests the case that there are multiple so_reuseport udp_sk in a bucket of the udp_table. The test creates two sets of so_reuseport sockets and each set on a different port. Meaning there will be two buckets in the udp_table. The test does the following: 1. read() 3 out of 4 sockets in the first bucket. 2. close() all sockets in the first bucket. This will ensure the current bucket's offset in the kernel does not affect the read() of the following bucket. 3. read() all 4 sockets in the second bucket. The test also reads one udp_sk at a time from the bpf_iter_udp prog. The true case in "do_test(..., bool onebyone)". This is the buggy case that the previous patch fixed. It also tests the "false" case in "do_test(..., bool onebyone)", meaning the userspace reads the whole bucket. There is no bug in this case but adding this test also while at it. Considering the way to have multiple tcp_sk in the same bucket is similar (by using so_reuseport), this patch also tests the bpf_iter_tcp even though the bpf_iter_tcp batching logic works correctly. Both IP v4 and v6 are exercising the same bpf_iter batching code path, so only v6 is tested. Acked-by: Yonghong Song Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240112190530.3751661-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/sock_iter_batch.c | 135 ++++++++++++++++++ .../selftests/bpf/progs/bpf_tracing_net.h | 3 + .../selftests/bpf/progs/sock_iter_batch.c | 91 ++++++++++++ .../testing/selftests/bpf/progs/test_jhash.h | 31 ++++ 4 files changed, 260 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c create mode 100644 tools/testing/selftests/bpf/progs/sock_iter_batch.c diff --git a/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c b/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c new file mode 100644 index 0000000000000..0c365f36c73b5 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Meta + +#include +#include "network_helpers.h" +#include "sock_iter_batch.skel.h" + +#define TEST_NS "sock_iter_batch_netns" + +static const int nr_soreuse = 4; + +static void do_test(int sock_type, bool onebyone) +{ + int err, i, nread, to_read, total_read, iter_fd = -1; + int first_idx, second_idx, indices[nr_soreuse]; + struct bpf_link *link = NULL; + struct sock_iter_batch *skel; + int *fds[2] = {}; + + skel = sock_iter_batch__open(); + if (!ASSERT_OK_PTR(skel, "sock_iter_batch__open")) + return; + + /* Prepare 2 buckets of sockets in the kernel hashtable */ + for (i = 0; i < ARRAY_SIZE(fds); i++) { + int local_port; + + fds[i] = start_reuseport_server(AF_INET6, sock_type, "::1", 0, 0, + nr_soreuse); + if (!ASSERT_OK_PTR(fds[i], "start_reuseport_server")) + goto done; + local_port = get_socket_local_port(*fds[i]); + if (!ASSERT_GE(local_port, 0, "get_socket_local_port")) + goto done; + skel->rodata->ports[i] = ntohs(local_port); + } + + err = sock_iter_batch__load(skel); + if (!ASSERT_OK(err, "sock_iter_batch__load")) + goto done; + + link = bpf_program__attach_iter(sock_type == SOCK_STREAM ? + skel->progs.iter_tcp_soreuse : + skel->progs.iter_udp_soreuse, + NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter")) + goto done; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_GE(iter_fd, 0, "bpf_iter_create")) + goto done; + + /* Test reading a bucket (either from fds[0] or fds[1]). + * Only read "nr_soreuse - 1" number of sockets + * from a bucket and leave one socket out from + * that bucket on purpose. + */ + to_read = (nr_soreuse - 1) * sizeof(*indices); + total_read = 0; + first_idx = -1; + do { + nread = read(iter_fd, indices, onebyone ? sizeof(*indices) : to_read); + if (nread <= 0 || nread % sizeof(*indices)) + break; + total_read += nread; + + if (first_idx == -1) + first_idx = indices[0]; + for (i = 0; i < nread / sizeof(*indices); i++) + ASSERT_EQ(indices[i], first_idx, "first_idx"); + } while (total_read < to_read); + ASSERT_EQ(nread, onebyone ? sizeof(*indices) : to_read, "nread"); + ASSERT_EQ(total_read, to_read, "total_read"); + + free_fds(fds[first_idx], nr_soreuse); + fds[first_idx] = NULL; + + /* Read the "whole" second bucket */ + to_read = nr_soreuse * sizeof(*indices); + total_read = 0; + second_idx = !first_idx; + do { + nread = read(iter_fd, indices, onebyone ? sizeof(*indices) : to_read); + if (nread <= 0 || nread % sizeof(*indices)) + break; + total_read += nread; + + for (i = 0; i < nread / sizeof(*indices); i++) + ASSERT_EQ(indices[i], second_idx, "second_idx"); + } while (total_read <= to_read); + ASSERT_EQ(nread, 0, "nread"); + /* Both so_reuseport ports should be in different buckets, so + * total_read must equal to the expected to_read. + * + * For a very unlikely case, both ports collide at the same bucket, + * the bucket offset (i.e. 3) will be skipped and it cannot + * expect the to_read number of bytes. + */ + if (skel->bss->bucket[0] != skel->bss->bucket[1]) + ASSERT_EQ(total_read, to_read, "total_read"); + +done: + for (i = 0; i < ARRAY_SIZE(fds); i++) + free_fds(fds[i], nr_soreuse); + if (iter_fd < 0) + close(iter_fd); + bpf_link__destroy(link); + sock_iter_batch__destroy(skel); +} + +void test_sock_iter_batch(void) +{ + struct nstoken *nstoken = NULL; + + SYS_NOFAIL("ip netns del " TEST_NS " &> /dev/null"); + SYS(done, "ip netns add %s", TEST_NS); + SYS(done, "ip -net %s link set dev lo up", TEST_NS); + + nstoken = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(nstoken, "open_netns")) + goto done; + + if (test__start_subtest("tcp")) { + do_test(SOCK_STREAM, true); + do_test(SOCK_STREAM, false); + } + if (test__start_subtest("udp")) { + do_test(SOCK_DGRAM, true); + do_test(SOCK_DGRAM, false); + } + close_netns(nstoken); + +done: + SYS_NOFAIL("ip netns del " TEST_NS " &> /dev/null"); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 1bdc680b0e0e2..e8bd4b7b5ef76 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -72,6 +72,8 @@ #define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr #define inet_dport sk.__sk_common.skc_dport +#define udp_portaddr_hash inet.sk.__sk_common.skc_u16hashes[1] + #define ir_loc_addr req.__req_common.skc_rcv_saddr #define ir_num req.__req_common.skc_num #define ir_rmt_addr req.__req_common.skc_daddr @@ -85,6 +87,7 @@ #define sk_rmem_alloc sk_backlog.rmem_alloc #define sk_refcnt __sk_common.skc_refcnt #define sk_state __sk_common.skc_state +#define sk_net __sk_common.skc_net #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr #define sk_flags __sk_common.skc_flags diff --git a/tools/testing/selftests/bpf/progs/sock_iter_batch.c b/tools/testing/selftests/bpf/progs/sock_iter_batch.c new file mode 100644 index 0000000000000..ffbbfe1fa1c1e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/sock_iter_batch.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Meta + +#include "vmlinux.h" +#include +#include +#include +#include "bpf_tracing_net.h" +#include "bpf_kfuncs.h" + +#define ATTR __always_inline +#include "test_jhash.h" + +static bool ipv6_addr_loopback(const struct in6_addr *a) +{ + return (a->s6_addr32[0] | a->s6_addr32[1] | + a->s6_addr32[2] | (a->s6_addr32[3] ^ bpf_htonl(1))) == 0; +} + +volatile const __u16 ports[2]; +unsigned int bucket[2]; + +SEC("iter/tcp") +int iter_tcp_soreuse(struct bpf_iter__tcp *ctx) +{ + struct sock *sk = (struct sock *)ctx->sk_common; + struct inet_hashinfo *hinfo; + unsigned int hash; + struct net *net; + int idx; + + if (!sk) + return 0; + + sk = bpf_rdonly_cast(sk, bpf_core_type_id_kernel(struct sock)); + if (sk->sk_family != AF_INET6 || + sk->sk_state != TCP_LISTEN || + !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) + return 0; + + if (sk->sk_num == ports[0]) + idx = 0; + else if (sk->sk_num == ports[1]) + idx = 1; + else + return 0; + + /* bucket selection as in inet_lhash2_bucket_sk() */ + net = sk->sk_net.net; + hash = jhash2(sk->sk_v6_rcv_saddr.s6_addr32, 4, net->hash_mix); + hash ^= sk->sk_num; + hinfo = net->ipv4.tcp_death_row.hashinfo; + bucket[idx] = hash & hinfo->lhash2_mask; + bpf_seq_write(ctx->meta->seq, &idx, sizeof(idx)); + + return 0; +} + +#define udp_sk(ptr) container_of(ptr, struct udp_sock, inet.sk) + +SEC("iter/udp") +int iter_udp_soreuse(struct bpf_iter__udp *ctx) +{ + struct sock *sk = (struct sock *)ctx->udp_sk; + struct udp_table *udptable; + int idx; + + if (!sk) + return 0; + + sk = bpf_rdonly_cast(sk, bpf_core_type_id_kernel(struct sock)); + if (sk->sk_family != AF_INET6 || + !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) + return 0; + + if (sk->sk_num == ports[0]) + idx = 0; + else if (sk->sk_num == ports[1]) + idx = 1; + else + return 0; + + /* bucket selection as in udp_hashslot2() */ + udptable = sk->sk_net.net->ipv4.udp_table; + bucket[idx] = udp_sk(sk)->udp_portaddr_hash & udptable->mask; + bpf_seq_write(ctx->meta->seq, &idx, sizeof(idx)); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_jhash.h b/tools/testing/selftests/bpf/progs/test_jhash.h index c300734d26f63..ef53559bbbdf1 100644 --- a/tools/testing/selftests/bpf/progs/test_jhash.h +++ b/tools/testing/selftests/bpf/progs/test_jhash.h @@ -69,3 +69,34 @@ u32 jhash(const void *key, u32 length, u32 initval) return c; } + +static __always_inline u32 jhash2(const u32 *k, u32 length, u32 initval) +{ + u32 a, b, c; + + /* Set up the internal state */ + a = b = c = JHASH_INITVAL + (length<<2) + initval; + + /* Handle most of the key */ + while (length > 3) { + a += k[0]; + b += k[1]; + c += k[2]; + __jhash_mix(a, b, c); + length -= 3; + k += 3; + } + + /* Handle the last 3 u32's */ + switch (length) { + case 3: c += k[2]; + case 2: b += k[1]; + case 1: a += k[0]; + __jhash_final(a, b, c); + break; + case 0: /* Nothing left to add */ + break; + } + + return c; +} -- GitLab From dc9dfc8dc629e42f2234e3327b75324ffc752bc9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Jan 2024 16:32:57 -0800 Subject: [PATCH 0901/1290] net: tls, fix WARNIING in __sk_msg_free A splice with MSG_SPLICE_PAGES will cause tls code to use the tls_sw_sendmsg_splice path in the TLS sendmsg code to move the user provided pages from the msg into the msg_pl. This will loop over the msg until msg_pl is full, checked by sk_msg_full(msg_pl). The user can also set the MORE flag to hint stack to delay sending until receiving more pages and ideally a full buffer. If the user adds more pages to the msg than can fit in the msg_pl scatterlist (MAX_MSG_FRAGS) we should ignore the MORE flag and send the buffer anyways. What actually happens though is we abort the msg to msg_pl scatterlist setup and then because we forget to set 'full record' indicating we can no longer consume data without a send we fallthrough to the 'continue' path which will check if msg_data_left(msg) has more bytes to send and then attempts to fit them in the already full msg_pl. Then next iteration of sender doing send will encounter a full msg_pl and throw the warning in the syzbot report. To fix simply check if we have a full_record in splice code path and if not send the msg regardless of MORE flag. Reported-and-tested-by: syzbot+f2977222e0e95cec15c8@syzkaller.appspotmail.com Reported-by: Edward Adam Davis Fixes: fe1e81d4f73b ("tls/sw: Support MSG_SPLICE_PAGES") Reviewed-by: Jakub Kicinski Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index e37b4d2e2acde..31e8a94dfc111 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1052,7 +1052,11 @@ alloc_encrypted: if (ret < 0) goto send_end; tls_ctx->pending_open_record_frags = true; - if (full_record || eor || sk_msg_full(msg_pl)) + + if (sk_msg_full(msg_pl)) + full_record = true; + + if (full_record || eor) goto copied; continue; } -- GitLab From 034ea1305e659ddae44c19ba8449166fec318e2d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Jan 2024 16:32:58 -0800 Subject: [PATCH 0902/1290] net: tls, add test to capture error on large splice syzbot found an error with how splice() is handled with a msg greater than 32. This was fixed in previous patch, but lets add a test for it to ensure it continues to work. Signed-off-by: John Fastabend Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 464853a7f9829..7799e042a9719 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -707,6 +707,20 @@ TEST_F(tls, splice_from_pipe) EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); } +TEST_F(tls, splice_more) +{ + unsigned int f = SPLICE_F_NONBLOCK | SPLICE_F_MORE | SPLICE_F_GIFT; + int send_len = TLS_PAYLOAD_MAX_LEN; + char mem_send[TLS_PAYLOAD_MAX_LEN]; + int i, send_pipe = 1; + int p[2]; + + ASSERT_GE(pipe(p), 0); + EXPECT_GE(write(p[1], mem_send, send_len), 0); + for (i = 0; i < 32; i++) + EXPECT_EQ(splice(p[0], NULL, self->fd, NULL, send_pipe, f), 1); +} + TEST_F(tls, splice_from_pipe2) { int send_len = 16000; -- GitLab From c061be1bd5e74e9685630cc95b818e13dbff9713 Mon Sep 17 00:00:00 2001 From: Marcin Wojtas Date: Fri, 12 Jan 2024 14:06:28 +0000 Subject: [PATCH 0903/1290] MAINTAINERS: eth: mvneta: update entry Add myself as driver maintainer and restore the maintained status. While at it, update the file field to cover mvneta_bm part of the driver. Signed-off-by: Marcin Wojtas Signed-off-by: David S. Miller --- MAINTAINERS | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4194f414dd2c5..39eb0a6a29277 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12869,9 +12869,10 @@ S: Maintained F: drivers/thermal/armada_thermal.c MARVELL MVNETA ETHERNET DRIVER +M: Marcin Wojtas L: netdev@vger.kernel.org -S: Orphan -F: drivers/net/ethernet/marvell/mvneta.* +S: Maintained +F: drivers/net/ethernet/marvell/mvneta* MARVELL MVPP2 ETHERNET DRIVER M: Marcin Wojtas -- GitLab From 95931a245b44ee04f3359ec432e73614d44d8b38 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 14 Jan 2024 10:00:59 +0100 Subject: [PATCH 0904/1290] null_blk: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/bf257b1078475a415cdc3344c6a750842946e367.1705222845.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 9f7695f00c2db..36755f263e8ec 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1840,7 +1840,7 @@ static void null_del_dev(struct nullb *nullb) dev = nullb->dev; - ida_simple_remove(&nullb_indexes, nullb->index); + ida_free(&nullb_indexes, nullb->index); list_del_init(&nullb->list); @@ -2174,7 +2174,7 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); mutex_lock(&lock); - rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); + rv = ida_alloc(&nullb_indexes, GFP_KERNEL); if (rv < 0) { mutex_unlock(&lock); goto out_cleanup_zone; -- GitLab From 2539b15d504c3f9fd8ca82032bf9c80c9864412c Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 12 Jan 2024 15:03:21 -0800 Subject: [PATCH 0905/1290] hwmon: (npcm750-pwm-fan) Fix crash observed when instantiating nuvoton,npcm750-pwm-fan Commit 89fec128d5d1 ("hwmon: (npcm750-pwm-fan) Add NPCM8xx support") introduced support for PWM fans on Nuvoton's npcm845 SoC. This chip supports three PWM modules with four PWM channels each. The older npcm750 only supported two PWM modules. The commit did not take into account that the older SoC only supported two PWM modules. This results in a crash if npcm750 is instantiated when the code attempts to instantiate the non-existing third PWM module. Unable to handle kernel paging request at virtual address e0aa2000 when write [e0aa2000] *pgd=04ab6811, *pte=00000000, *ppte=00000000 Internal error: Oops: 807 [#1] SMP ARM Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Tainted: G N 6.7.0-next-20240112-dirty #3 Hardware name: NPCM7XX Chip family PC is at npcm7xx_pwm_fan_probe+0x204/0x890 LR is at arm_heavy_mb+0x1c/0x38 Fix the problem by detecting the number of supported PWM modules in the probe function and only instantiating the supported modules. Fixes: 89fec128d5d1 ("hwmon: (npcm750-pwm-fan) Add NPCM8xx support") Cc: Tomer Maimon Signed-off-by: Guenter Roeck --- drivers/hwmon/npcm750-pwm-fan.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/npcm750-pwm-fan.c b/drivers/hwmon/npcm750-pwm-fan.c index d9733da8ea34f..904816abb7c46 100644 --- a/drivers/hwmon/npcm750-pwm-fan.c +++ b/drivers/hwmon/npcm750-pwm-fan.c @@ -195,6 +195,7 @@ struct npcm7xx_cooling_device { struct npcm7xx_pwm_fan_data { void __iomem *pwm_base; void __iomem *fan_base; + int pwm_modules; unsigned long pwm_clk_freq; unsigned long fan_clk_freq; struct clk *pwm_clk; @@ -710,7 +711,7 @@ static u32 npcm7xx_pwm_init(struct npcm7xx_pwm_fan_data *data) /* Setting PWM Prescale Register value register to both modules */ prescale_val |= (prescale_val << NPCM7XX_PWM_PRESCALE_SHIFT_CH01); - for (m = 0; m < NPCM7XX_PWM_MAX_MODULES ; m++) { + for (m = 0; m < data->pwm_modules; m++) { iowrite32(prescale_val, NPCM7XX_PWM_REG_PR(data->pwm_base, m)); iowrite32(NPCM7XX_PWM_PRESCALE2_DEFAULT, NPCM7XX_PWM_REG_CSR(data->pwm_base, m)); @@ -946,6 +947,8 @@ static int npcm7xx_pwm_fan_probe(struct platform_device *pdev) if (!data->info) return -EINVAL; + data->pwm_modules = data->info->pwm_max_channel / NPCM7XX_PWM_MAX_CHN_NUM_IN_A_MODULE; + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "pwm"); if (!res) { dev_err(dev, "pwm resource not found\n"); @@ -983,7 +986,7 @@ static int npcm7xx_pwm_fan_probe(struct platform_device *pdev) output_freq = npcm7xx_pwm_init(data); npcm7xx_fan_init(data); - for (cnt = 0; cnt < NPCM7XX_PWM_MAX_MODULES ; cnt++) + for (cnt = 0; cnt < data->pwm_modules; cnt++) mutex_init(&data->pwm_lock[cnt]); for (i = 0; i < NPCM7XX_FAN_MAX_MODULE; i++) { -- GitLab From e327b2372bc0f18c30433ac40be07741b59231c5 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Sat, 13 Jan 2024 10:22:21 +0600 Subject: [PATCH 0906/1290] net: ravb: Fix dma_addr_t truncation in error case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In ravb_start_xmit(), ravb driver uses u32 variable to store result of dma_map_single() call. Since ravb hardware has 32-bit address fields in descriptors, this works properly when mapping is successful - it is platform's job to provide mapping addresses that fit into hardware limitations. However, in failure case dma_map_single() returns DMA_MAPPING_ERROR constant that is 64-bit when dma_addr_t is 64-bit. Storing this constant in u32 leads to truncation, and further call to dma_mapping_error() fails to notice the error. Fix that by storing result of dma_map_single() in a dma_addr_t variable. Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Signed-off-by: Nikita Yushchenko Reviewed-by: Niklas Söderlund Reviewed-by: Sergey Shtylyov Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/ravb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 8649b3e90edb2..0e3731f50fc28 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1949,7 +1949,7 @@ static netdev_tx_t ravb_start_xmit(struct sk_buff *skb, struct net_device *ndev) struct ravb_tstamp_skb *ts_skb; struct ravb_tx_desc *desc; unsigned long flags; - u32 dma_addr; + dma_addr_t dma_addr; void *buffer; u32 entry; u32 len; -- GitLab From fdfd6dde4328635861db029f6fdb649e17350526 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 14 Jan 2024 15:17:18 +0900 Subject: [PATCH 0907/1290] ksmbd: update feature status in documentation Update ksmbd feature status in documentation file. - add support for v2 lease feature and SMB3 CCM/GCM256 encryption. - add planned compression, quic, gmac signing features. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- Documentation/filesystems/smb/ksmbd.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/smb/ksmbd.rst b/Documentation/filesystems/smb/ksmbd.rst index 7bed96d794fc2..6b30e43a0d11f 100644 --- a/Documentation/filesystems/smb/ksmbd.rst +++ b/Documentation/filesystems/smb/ksmbd.rst @@ -73,15 +73,14 @@ Auto Negotiation Supported. Compound Request Supported. Oplock Cache Mechanism Supported. SMB2 leases(v1 lease) Supported. -Directory leases(v2 lease) Planned for future. +Directory leases(v2 lease) Supported. Multi-credits Supported. NTLM/NTLMv2 Supported. HMAC-SHA256 Signing Supported. Secure negotiate Supported. Signing Update Supported. Pre-authentication integrity Supported. -SMB3 encryption(CCM, GCM) Supported. (CCM and GCM128 supported, GCM256 in - progress) +SMB3 encryption(CCM, GCM) Supported. (CCM/GCM128 and CCM/GCM256 supported) SMB direct(RDMA) Supported. SMB3 Multi-channel Partially Supported. Planned to implement replay/retry mechanisms for future. @@ -112,6 +111,10 @@ DCE/RPC support Partially Supported. a few calls(NetShareEnumAll, for Witness protocol e.g.) ksmbd/nfsd interoperability Planned for future. The features that ksmbd support are Leases, Notify, ACLs and Share modes. +SMB3.1.1 Compression Planned for future. +SMB3.1.1 over QUIC Planned for future. +Signing/Encryption over RDMA Planned for future. +SMB3.1.1 GMAC signing support Planned for future. ============================== ================================================= -- GitLab From 92e470163d96df8db6c4fa0f484e4a229edb903d Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 13 Jan 2024 15:11:41 +0900 Subject: [PATCH 0908/1290] ksmbd: validate mech token in session setup If client send invalid mech token in session setup request, ksmbd validate and make the error if it is invalid. Cc: stable@vger.kernel.org Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-22890 Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/asn1.c | 5 +++++ fs/smb/server/connection.h | 1 + fs/smb/server/smb2pdu.c | 22 +++++++++++++++++----- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c index 4a4b2b03ff33d..b931a99ab9c85 100644 --- a/fs/smb/server/asn1.c +++ b/fs/smb/server/asn1.c @@ -214,10 +214,15 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen, { struct ksmbd_conn *conn = context; + if (!vlen) + return -EINVAL; + conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL); if (!conn->mechToken) return -ENOMEM; + conn->mechTokenLen = (unsigned int)vlen; + return 0; } diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 3c005246a32e8..342f935f57705 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -88,6 +88,7 @@ struct ksmbd_conn { __u16 dialect; char *mechToken; + unsigned int mechTokenLen; struct ksmbd_conn_ops *conn_ops; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 3143819935dca..ba7a72a6a4f45 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1414,7 +1414,10 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn, char *name; unsigned int name_off, name_len, secbuf_len; - secbuf_len = le16_to_cpu(req->SecurityBufferLength); + if (conn->use_spnego && conn->mechToken) + secbuf_len = conn->mechTokenLen; + else + secbuf_len = le16_to_cpu(req->SecurityBufferLength); if (secbuf_len < sizeof(struct authenticate_message)) { ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len); return NULL; @@ -1505,7 +1508,10 @@ static int ntlm_authenticate(struct ksmbd_work *work, struct authenticate_message *authblob; authblob = user_authblob(conn, req); - sz = le16_to_cpu(req->SecurityBufferLength); + if (conn->use_spnego && conn->mechToken) + sz = conn->mechTokenLen; + else + sz = le16_to_cpu(req->SecurityBufferLength); rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess); if (rc) { set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD); @@ -1778,8 +1784,7 @@ int smb2_sess_setup(struct ksmbd_work *work) negblob_off = le16_to_cpu(req->SecurityBufferOffset); negblob_len = le16_to_cpu(req->SecurityBufferLength); - if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) || - negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) { + if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer)) { rc = -EINVAL; goto out_err; } @@ -1788,8 +1793,15 @@ int smb2_sess_setup(struct ksmbd_work *work) negblob_off); if (decode_negotiation_token(conn, negblob, negblob_len) == 0) { - if (conn->mechToken) + if (conn->mechToken) { negblob = (struct negotiate_message *)conn->mechToken; + negblob_len = conn->mechTokenLen; + } + } + + if (negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) { + rc = -EINVAL; + goto out_err; } if (server_conf.auth_mechs & conn->auth_mechs) { -- GitLab From 38d20c62903d669693a1869aa68c4dd5674e2544 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 13 Jan 2024 15:30:07 +0900 Subject: [PATCH 0909/1290] ksmbd: fix UAF issue in ksmbd_tcp_new_connection() The race is between the handling of a new TCP connection and its disconnection. It leads to UAF on `struct tcp_transport` in ksmbd_tcp_new_connection() function. Cc: stable@vger.kernel.org Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-22991 Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 6 ------ fs/smb/server/connection.h | 1 - fs/smb/server/transport_rdma.c | 11 ++++++----- fs/smb/server/transport_tcp.c | 13 +++++++------ 4 files changed, 13 insertions(+), 18 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index d311c2ee10bd7..09e1e7771592f 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -416,13 +416,7 @@ static void stop_sessions(void) again: down_read(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) { - struct task_struct *task; - t = conn->transport; - task = t->handler; - if (task) - ksmbd_debug(CONN, "Stop session handler %s/%d\n", - task->comm, task_pid_nr(task)); ksmbd_conn_set_exiting(conn); if (t->ops->shutdown) { up_read(&conn_list_lock); diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 342f935f57705..0e04cf8b1d896 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -135,7 +135,6 @@ struct ksmbd_transport_ops { struct ksmbd_transport { struct ksmbd_conn *conn; struct ksmbd_transport_ops *ops; - struct task_struct *handler; }; #define KSMBD_TCP_RECV_TIMEOUT (7 * HZ) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c5629a68c8b73..8faa25c6e129b 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -2039,6 +2039,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) { struct smb_direct_transport *t; + struct task_struct *handler; int ret; if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { @@ -2056,11 +2057,11 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) if (ret) goto out_err; - KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, - KSMBD_TRANS(t)->conn, "ksmbd:r%u", - smb_direct_port); - if (IS_ERR(KSMBD_TRANS(t)->handler)) { - ret = PTR_ERR(KSMBD_TRANS(t)->handler); + handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, "ksmbd:r%u", + smb_direct_port); + if (IS_ERR(handler)) { + ret = PTR_ERR(handler); pr_err("Can't start thread\n"); goto out_err; } diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index eff7a1d793f00..9d4222154dcc0 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -185,6 +185,7 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) struct sockaddr *csin; int rc = 0; struct tcp_transport *t; + struct task_struct *handler; t = alloc_transport(client_sk); if (!t) { @@ -199,13 +200,13 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) goto out_error; } - KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, - KSMBD_TRANS(t)->conn, - "ksmbd:%u", - ksmbd_tcp_get_port(csin)); - if (IS_ERR(KSMBD_TRANS(t)->handler)) { + handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, + "ksmbd:%u", + ksmbd_tcp_get_port(csin)); + if (IS_ERR(handler)) { pr_err("cannot start conn thread\n"); - rc = PTR_ERR(KSMBD_TRANS(t)->handler); + rc = PTR_ERR(handler); free_transport(t); } return rc; -- GitLab From 77bebd186442a7d703b796784db7495129cc3e70 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 15 Jan 2024 10:24:54 +0900 Subject: [PATCH 0910/1290] ksmbd: only v2 leases handle the directory When smb2 leases is disable, ksmbd can send oplock break notification and cause wait oplock break ack timeout. It may appear like hang when accessing a directory. This patch make only v2 leases handle the directory. Cc: stable@vger.kernel.org Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/oplock.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 001926d3b348c..53dfaac425c68 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -1197,6 +1197,12 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, bool prev_op_has_lease; __le32 prev_op_state = 0; + /* Only v2 leases handle the directory */ + if (S_ISDIR(file_inode(fp->filp)->i_mode)) { + if (!lctx || lctx->version != 2) + return 0; + } + opinfo = alloc_opinfo(work, pid, tid); if (!opinfo) return -ENOMEM; -- GitLab From 7c65aa3cc072cee76f577262fbe381a111a98774 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 13 Jan 2024 20:46:42 -0800 Subject: [PATCH 0911/1290] dma-debug: fix kernel-doc warnings Update the kernel-doc comments to catch up with the code changes and fix the kernel-doc warnings: debug.c:83: warning: Excess struct member 'stacktrace' description in 'dma_debug_entry' debug.c:83: warning: Function parameter or struct member 'stack_len' not described in 'dma_debug_entry' debug.c:83: warning: Function parameter or struct member 'stack_entries' not described in 'dma_debug_entry' Fixes: 746017ed8d4d ("dma/debug: Simplify stracktrace retrieval") Signed-off-by: Randy Dunlap Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: iommu@lists.linux.dev Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 3de494375b7b3..08b7f84b76f9e 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -62,7 +62,8 @@ enum map_err_types { * @pfn: page frame of the start address * @offset: offset of mapping relative to pfn * @map_err_type: track whether dma_mapping_error() was checked - * @stacktrace: support backtraces when a violation is detected + * @stack_len: number of backtrace entries in @stack_entries + * @stack_entries: stack of backtrace history */ struct dma_debug_entry { struct list_head list; -- GitLab From c8fb5a52b977ef91dc9342e556c63b9602065c8a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 12 Jan 2024 09:55:25 +0300 Subject: [PATCH 0912/1290] gpio: rtd: Fix signedness bug in probe The "data->irqs[]" array holds unsigned int so this error handling will not work correctly. Fixes: eee636bff0dc ("gpio: rtd: Add support for Realtek DHC(Digital Home Center) RTD SoCs") Signed-off-by: Dan Carpenter Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-rtd.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpio/gpio-rtd.c b/drivers/gpio/gpio-rtd.c index a7939bd0aa566..bf7f008f58d70 100644 --- a/drivers/gpio/gpio-rtd.c +++ b/drivers/gpio/gpio-rtd.c @@ -525,18 +525,21 @@ static int rtd_gpio_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct gpio_irq_chip *irq_chip; struct rtd_gpio *data; + int ret; data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; - data->irqs[0] = platform_get_irq(pdev, 0); - if (data->irqs[0] < 0) - return data->irqs[0]; + ret = platform_get_irq(pdev, 0); + if (ret < 0) + return ret; + data->irqs[0] = ret; - data->irqs[1] = platform_get_irq(pdev, 1); - if (data->irqs[1] < 0) - return data->irqs[1]; + ret = platform_get_irq(pdev, 1); + if (ret < 0) + return ret; + data->irqs[1] = ret; data->info = device_get_match_data(dev); if (!data->info) -- GitLab From 314c020c4ed3de72b15603eb6892250bc4b51702 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Fri, 12 Jan 2024 12:32:58 +0100 Subject: [PATCH 0913/1290] dt-bindings: gpio: xilinx: Fix node address in gpio Node address doesn't match reg property which is not correct. Fixes: ba96b2e7974b ("dt-bindings: gpio: gpio-xilinx: Convert Xilinx axi gpio binding to YAML") Signed-off-by: Michal Simek Reviewed-by: Krzysztof Kozlowski Signed-off-by: Bartosz Golaszewski --- Documentation/devicetree/bindings/gpio/xlnx,gpio-xilinx.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/gpio/xlnx,gpio-xilinx.yaml b/Documentation/devicetree/bindings/gpio/xlnx,gpio-xilinx.yaml index c1060e5fcef3a..d3d8a2e143ed2 100644 --- a/Documentation/devicetree/bindings/gpio/xlnx,gpio-xilinx.yaml +++ b/Documentation/devicetree/bindings/gpio/xlnx,gpio-xilinx.yaml @@ -126,7 +126,7 @@ examples: - | #include - gpio@e000a000 { + gpio@a0020000 { compatible = "xlnx,xps-gpio-1.00.a"; reg = <0xa0020000 0x10000>; #gpio-cells = <2>; -- GitLab From d460e9c2075164e9b1fa9c4c95f8c05517bd8752 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Fri, 12 Jan 2024 12:24:04 +0800 Subject: [PATCH 0914/1290] gpio: mlxbf3: add an error code check in mlxbf3_gpio_probe Clang static checker warning: Value stored to 'ret' is never read. bgpio_init() returns error code if failed, it's better to add this check. Fixes: cd33f216d241 ("gpio: mlxbf3: Add gpio driver support") Signed-off-by: Su Hui [Bartosz: add the Fixes: tag] Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mlxbf3.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpio/gpio-mlxbf3.c b/drivers/gpio/gpio-mlxbf3.c index 7a3e1760fc5b7..d5906d419b0ab 100644 --- a/drivers/gpio/gpio-mlxbf3.c +++ b/drivers/gpio/gpio-mlxbf3.c @@ -215,6 +215,8 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) gs->gpio_clr_io + MLXBF_GPIO_FW_DATA_OUT_CLEAR, gs->gpio_set_io + MLXBF_GPIO_FW_OUTPUT_ENABLE_SET, gs->gpio_clr_io + MLXBF_GPIO_FW_OUTPUT_ENABLE_CLEAR, 0); + if (ret) + return dev_err_probe(dev, ret, "%s: bgpio_init() failed", __func__); gc->request = gpiochip_generic_request; gc->free = gpiochip_generic_free; -- GitLab From 3787ffdd13de81ba406e5b42c6c24f823395ba5e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 12 Jan 2024 18:10:00 +0100 Subject: [PATCH 0915/1290] ALSA: scarlett2: Fix yet more -Wformat-truncation warnings The recent code change introduced a few false-positive compile warnings with -Wformat-trucation again. Suppress them by replacing snprintf() with scnprintf(). Fixes: 0a995e38dc44 ("ALSA: scarlett2: Add support for software-controllable input gain") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401062344.AzZCYlpa-lkp@intel.com/ Link: https://lore.kernel.org/r/20240112171000.31855-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/mixer_scarlett2.c | 42 ++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/sound/usb/mixer_scarlett2.c b/sound/usb/mixer_scarlett2.c index 1de3ddc50eb6a..6de605a601e5f 100644 --- a/sound/usb/mixer_scarlett2.c +++ b/sound/usb/mixer_scarlett2.c @@ -5361,9 +5361,9 @@ static int scarlett2_add_line_out_ctls(struct usb_mixer_interface *mixer) if (private->vol_sw_hw_switch[index]) scarlett2_vol_ctl_set_writable(mixer, i, 0); - snprintf(s, sizeof(s), - "Line Out %02d Volume Control Playback Enum", - i + 1); + scnprintf(s, sizeof(s), + "Line Out %02d Volume Control Playback Enum", + i + 1); err = scarlett2_add_new_ctl(mixer, &scarlett2_sw_hw_enum_ctl, i, 1, s, @@ -5406,8 +5406,8 @@ static int scarlett2_add_line_in_ctls(struct usb_mixer_interface *mixer) /* Add input level (line/inst) controls */ for (i = 0; i < info->level_input_count; i++) { - snprintf(s, sizeof(s), fmt, i + 1 + info->level_input_first, - "Level", "Enum"); + scnprintf(s, sizeof(s), fmt, i + 1 + info->level_input_first, + "Level", "Enum"); err = scarlett2_add_new_ctl(mixer, &scarlett2_level_enum_ctl, i, 1, s, &private->level_ctls[i]); if (err < 0) @@ -5416,7 +5416,7 @@ static int scarlett2_add_line_in_ctls(struct usb_mixer_interface *mixer) /* Add input pad controls */ for (i = 0; i < info->pad_input_count; i++) { - snprintf(s, sizeof(s), fmt, i + 1, "Pad", "Switch"); + scnprintf(s, sizeof(s), fmt, i + 1, "Pad", "Switch"); err = scarlett2_add_new_ctl(mixer, &scarlett2_pad_ctl, i, 1, s, &private->pad_ctls[i]); if (err < 0) @@ -5425,8 +5425,8 @@ static int scarlett2_add_line_in_ctls(struct usb_mixer_interface *mixer) /* Add input air controls */ for (i = 0; i < info->air_input_count; i++) { - snprintf(s, sizeof(s), fmt, i + 1 + info->air_input_first, - "Air", info->air_option ? "Enum" : "Switch"); + scnprintf(s, sizeof(s), fmt, i + 1 + info->air_input_first, + "Air", info->air_option ? "Enum" : "Switch"); err = scarlett2_add_new_ctl( mixer, &scarlett2_air_ctl[info->air_option], i, 1, s, &private->air_ctls[i]); @@ -5481,9 +5481,9 @@ static int scarlett2_add_line_in_ctls(struct usb_mixer_interface *mixer) for (i = 0; i < info->gain_input_count; i++) { if (i % 2) { - snprintf(s, sizeof(s), - "Line In %d-%d Link Capture Switch", - i, i + 1); + scnprintf(s, sizeof(s), + "Line In %d-%d Link Capture Switch", + i, i + 1); err = scarlett2_add_new_ctl( mixer, &scarlett2_input_link_ctl, i / 2, 1, s, @@ -5492,30 +5492,30 @@ static int scarlett2_add_line_in_ctls(struct usb_mixer_interface *mixer) return err; } - snprintf(s, sizeof(s), fmt, i + 1, - "Gain", "Volume"); + scnprintf(s, sizeof(s), fmt, i + 1, + "Gain", "Volume"); err = scarlett2_add_new_ctl( mixer, &scarlett2_input_gain_ctl, i, 1, s, &private->input_gain_ctls[i]); if (err < 0) return err; - snprintf(s, sizeof(s), fmt, i + 1, - "Autogain", "Switch"); + scnprintf(s, sizeof(s), fmt, i + 1, + "Autogain", "Switch"); err = scarlett2_add_new_ctl( mixer, &scarlett2_autogain_switch_ctl, i, 1, s, &private->autogain_ctls[i]); if (err < 0) return err; - snprintf(s, sizeof(s), fmt, i + 1, - "Autogain Status", "Enum"); + scnprintf(s, sizeof(s), fmt, i + 1, + "Autogain Status", "Enum"); err = scarlett2_add_new_ctl( mixer, &scarlett2_autogain_status_ctl, i, 1, s, &private->autogain_status_ctls[i]); - snprintf(s, sizeof(s), fmt, i + 1, - "Safe", "Switch"); + scnprintf(s, sizeof(s), fmt, i + 1, + "Safe", "Switch"); err = scarlett2_add_new_ctl( mixer, &scarlett2_safe_ctl, i, 1, s, &private->safe_ctls[i]); @@ -5902,8 +5902,8 @@ static int scarlett2_add_direct_monitor_ctls(struct usb_mixer_interface *mixer) for (k = 0; k < private->num_mix_in; k++, index++) { char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; - snprintf(name, sizeof(name), format, - mix_type, 'A' + j, k + 1); + scnprintf(name, sizeof(name), format, + mix_type, 'A' + j, k + 1); err = scarlett2_add_new_ctl( mixer, &scarlett2_monitor_mix_ctl, -- GitLab From 19adbe96d3e3c2188ad5838b936550e073cba54d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 13 Jan 2024 17:08:54 +0100 Subject: [PATCH 0916/1290] ALSA: hda: generic: Remove obsolete call to ledtrig_audio_get Since 64f67b5240db ("leds: trigger: audio: Add an activate callback to ensure the initial brightness is set") the audio triggers have an activate callback which sets the LED brightness as soon as the (default) trigger is bound to the LED device. So we can remove the call to ledtrig_audio_get. Positive side effect: We have no code dependency to ledtrig-audio any longer, therefore, if built as module, it's no longer loaded if not needed. Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/3dc9167d-fb33-43a6-baa6-dbef8b5da7b9@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_generic.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c index bf685d01259d3..de2a3d08c73c1 100644 --- a/sound/pci/hda/hda_generic.c +++ b/sound/pci/hda/hda_generic.c @@ -3946,7 +3946,6 @@ static int create_mute_led_cdev(struct hda_codec *codec, cdev->max_brightness = 1; cdev->default_trigger = micmute ? "audio-micmute" : "audio-mute"; cdev->brightness_set_blocking = callback; - cdev->brightness = ledtrig_audio_get(idx); cdev->flags = LED_CORE_SUSPENDRESUME; err = led_classdev_register(&codec->core.dev, cdev); -- GitLab From 848c8f563dadfdf01358b001ef7c9afe2a6ece8f Mon Sep 17 00:00:00 2001 From: Rander Wang Date: Mon, 15 Jan 2024 11:22:08 +0200 Subject: [PATCH 0917/1290] ASoC: SOF: ipc4-pcm: remove log message for LLP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LLP is supported by I2S and SDW platforms but not supported by HDA platforms. This log is used to notify developer current LLP status for current device. Since above case is known to developers, the log is unnecessary and should be removed. Fixes: 7cb19007baba ("ASoC: SOF: ipc4-pcm: add hw_params") Signed-off-by: Rander Wang Reviewed-by: Péter Ujfalusi Reviewed-by: Pierre-Louis Bossart Signed-off-by: Peter Ujfalusi Link: https://msgid.link/r/20240115092209.7184-2-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-pcm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c index 39039a647cca3..85d3f390e4b29 100644 --- a/sound/soc/sof/ipc4-pcm.c +++ b/sound/soc/sof/ipc4-pcm.c @@ -768,10 +768,8 @@ static void sof_ipc4_build_time_info(struct snd_sof_dev *sdev, struct snd_sof_pc info->llp_offset = offsetof(struct sof_ipc4_fw_registers, llp_evad_reading_slot) + sdev->fw_info_box.offset; sof_mailbox_read(sdev, info->llp_offset, &llp_slot, sizeof(llp_slot)); - if (llp_slot.node_id != dai_copier->data.gtw_cfg.node_id) { - dev_info(sdev->dev, "no llp found, fall back to default HDA path"); + if (llp_slot.node_id != dai_copier->data.gtw_cfg.node_id) info->llp_offset = 0; - } } static int sof_ipc4_pcm_hw_params(struct snd_soc_component *component, -- GitLab From ab09fb9c629ed3aaea6a82467f08595dbc549726 Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Mon, 15 Jan 2024 11:22:09 +0200 Subject: [PATCH 0918/1290] ASoC: SOF: ipc4-loader: remove the CPC check warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Warnings related to missing data in firmware manifest have proven to be too verbose. This relates to description of DSP module cost expressed in cycles per chunk (CPC). If a matching value is not found in the manifest, kernel will pass a zero value and DSP firmware will use a conservative value in its place. Downgrade the warnings to dev_dbg(). Fixes: d8a2c9879349 ("ASoC: SOF: ipc4-loader/topology: Query the CPC value from manifest") Signed-off-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Signed-off-by: Peter Ujfalusi Link: https://msgid.link/r/20240115092209.7184-3-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/ipc4-loader.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sound/soc/sof/ipc4-loader.c b/sound/soc/sof/ipc4-loader.c index 3539b0a66e1be..c79479afa8d0d 100644 --- a/sound/soc/sof/ipc4-loader.c +++ b/sound/soc/sof/ipc4-loader.c @@ -482,13 +482,10 @@ void sof_ipc4_update_cpc_from_manifest(struct snd_sof_dev *sdev, msg = "No CPC match in the firmware file's manifest"; no_cpc: - dev_warn(sdev->dev, "%s (UUID: %pUL): %s (ibs/obs: %u/%u)\n", - fw_module->man4_module_entry.name, - &fw_module->man4_module_entry.uuid, msg, basecfg->ibs, - basecfg->obs); - dev_warn_once(sdev->dev, "Please try to update the firmware.\n"); - dev_warn_once(sdev->dev, "If the issue persists, file a bug at\n"); - dev_warn_once(sdev->dev, "https://github.com/thesofproject/sof/issues/\n"); + dev_dbg(sdev->dev, "%s (UUID: %pUL): %s (ibs/obs: %u/%u)\n", + fw_module->man4_module_entry.name, + &fw_module->man4_module_entry.uuid, msg, basecfg->ibs, + basecfg->obs); } const struct sof_ipc_fw_loader_ops ipc4_loader_ops = { -- GitLab From 521277d12b5a75982d4f642d2ee22db8d7f986dd Mon Sep 17 00:00:00 2001 From: Nicky Chorley Date: Sun, 14 Jan 2024 19:10:56 +0000 Subject: [PATCH 0919/1290] block: Correct a documentation comment in blk-cgroup.c Commit 99e603874366 ("blk-cgroup: pass a gendisk to the blkg allocation helpers") changed blkg_alloc() to take a struct gendisk instead of a struct request_queue, but the documentation comment still referred to q. So, update that comment to refer to disk instead and fix a typo. Signed-off-by: Nicky Chorley Link: https://lore.kernel.org/r/20240114191056.6992-1-ndchorley@gmail.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e303fd3173137..ff93c385ba5af 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -300,7 +300,7 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * - * Allocate a new blkg assocating @blkcg and @q. + * Allocate a new blkg associating @blkcg and @disk. */ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) -- GitLab From 5c7fa5c8ad79a1d7cc9f59636e2f99e8b5471248 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 15 Jan 2024 22:56:26 +0800 Subject: [PATCH 0920/1290] sbitmap: remove stale comment in sbq_calc_wake_batch After commit 106397376c036 ("sbitmap: fix batching wakeup"), we may wake up more than one queue for each batch. Just remove stale comment that we wake up only one queue for each batch. Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240115145626.665562-1-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- lib/sbitmap.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index d0a5081dfd122..92c6b1fd89893 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -388,11 +388,6 @@ static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq, unsigned int shallow_depth; /* - * For each batch, we wake up one queue. We need to make sure that our - * batch size is small enough that the full depth of the bitmap, - * potentially limited by a shallow depth, is enough to wake up all of - * the queues. - * * Each full word of the bitmap has bits_per_word bits, and there might * be a partial word. There are depth / bits_per_word full words and * depth % bits_per_word bits left over. In bitwise arithmetic: -- GitLab From 7b1a8a5fcee4a85be1f540ac0e09761d421e562d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Roberto=20de=20Souza?= Date: Thu, 4 Jan 2024 08:18:32 -0800 Subject: [PATCH 0921/1290] drm/xe: Fix definition of intel_wakeref_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit i915 defines it as unsigned long so Xe should do the same to avoid compilation warnings: CC [M] drivers/gpu/drm/i915/i915_gem.o CC [M] drivers/gpu/drm/xe/i915-display/intel_display_power_well.o In file included from ./include/drm/drm_mm.h:51, from drivers/gpu/drm/xe/xe_bo_types.h:11, from drivers/gpu/drm/xe/xe_bo.h:11, from ./drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_object.h:11, from ./drivers/gpu/drm/xe/compat-i915-headers/i915_drv.h:15, from drivers/gpu/drm/i915/display/intel_display_power.c:8: drivers/gpu/drm/i915/display/intel_display_power.c: In function ‘print_async_put_domains_state’: drivers/gpu/drm/i915/display/intel_display_power.c:408:29: warning: format ‘%lu’ expects argument of type ‘long unsigned int’, but argument 5 has type ‘int’ [-Wformat=] 408 | drm_dbg(&i915->drm, "async_put_wakeref %lu\n", | ^~~~~~~~~~~~~~~~~~~~~~~~~ 409 | power_domains->async_put_wakeref); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | | | int ./include/drm/drm_print.h:410:39: note: in definition of macro ‘drm_dev_dbg’ 410 | __drm_dev_dbg(NULL, dev, cat, fmt, ##__VA_ARGS__) | ^~~ ./include/drm/drm_print.h:510:33: note: in expansion of macro ‘drm_dbg_driver’ 510 | #define drm_dbg(drm, fmt, ...) drm_dbg_driver(drm, fmt, ##__VA_ARGS__) | ^~~~~~~~~~~~~~ drivers/gpu/drm/i915/display/intel_display_power.c:408:9: note: in expansion of macro ‘drm_dbg’ 408 | drm_dbg(&i915->drm, "async_put_wakeref %lu\n", | ^~~~~~~ drivers/gpu/drm/i915/display/intel_display_power.c:408:50: note: format string is defined here 408 | drm_dbg(&i915->drm, "async_put_wakeref %lu\n", | ~~^ | | | long unsigned int | %u CC [M] drivers/gpu/drm/i915/i915_gem_evict.o CC [M] drivers/gpu/drm/i915/i915_gem_gtt.o CC [M] drivers/gpu/drm/xe/i915-display/intel_display_trace.o CC [M] drivers/gpu/drm/xe/i915-display/intel_display_wa.o CC [M] drivers/gpu/drm/i915/i915_query.o Fixes: 44e694958b95 ("drm/xe/display: Implement display support") Cc: Maarten Lankhorst Reviewed-by: Jani Nikula Signed-off-by: José Roberto de Souza (cherry picked from commit fdbadf504375886a0320ac6f84c850322a6b32e1) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/compat-i915-headers/intel_wakeref.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_wakeref.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_wakeref.h index 1c5e30cf10caa..ecb1c07077069 100644 --- a/drivers/gpu/drm/xe/compat-i915-headers/intel_wakeref.h +++ b/drivers/gpu/drm/xe/compat-i915-headers/intel_wakeref.h @@ -5,4 +5,4 @@ #include -typedef bool intel_wakeref_t; +typedef unsigned long intel_wakeref_t; -- GitLab From 56c253daabc8bd9dfbae52c3d9e0dd34977347a6 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 4 Jan 2024 00:00:39 -0800 Subject: [PATCH 0922/1290] drm/xe: Fix exec IOCTL long running exec queue ring full condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The intent is to return -EWOULDBLOCK to the user if a long running exec queue is full during the exec IOCTL. -EWOULDBLOCK aliases to -EAGAIN which results in the exec IOCTL doing a retry loop. Fix this by ensuring the retry loop is broken when returning -EWOULDBLOCK. Fixes: 8ae8a2e8dd21 ("drm/xe: Long running job update") Reported-by: Sai Gowtham Ch Signed-off-by: Matthew Brost Reviewed-by: Brian Welty (cherry picked from commit 97d0047cbb17318431eaf37dfe1a6855539340f9) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_exec.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c index d30c0d0689bcc..b853feed9ccc1 100644 --- a/drivers/gpu/drm/xe/xe_exec.c +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -115,7 +115,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) struct xe_sched_job *job; struct dma_fence *rebind_fence; struct xe_vm *vm; - bool write_locked; + bool write_locked, skip_retry = false; ktime_t end = 0; int err = 0; @@ -227,7 +227,8 @@ retry: } if (xe_exec_queue_is_lr(q) && xe_exec_queue_ring_full(q)) { - err = -EWOULDBLOCK; + err = -EWOULDBLOCK; /* Aliased to -EAGAIN */ + skip_retry = true; goto err_exec; } @@ -337,7 +338,7 @@ err_unlock_list: up_write(&vm->lock); else up_read(&vm->lock); - if (err == -EAGAIN) + if (err == -EAGAIN && !skip_retry) goto retry; err_syncs: for (i = 0; i < num_syncs; i++) -- GitLab From 457f4439833487acb18abdd55e95fbb17d43fdca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Fri, 22 Dec 2023 18:59:04 +0100 Subject: [PATCH 0923/1290] drm/xe/vm: Fix an error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If using the VM_BIND_OP_UNMAP_ALL without any bound vmas for the vm, we will end up dereferencing an uninitialized variable and leak a bo lock. Fix this. v2: - Updated commit message (Lucas De Marchi) Reported-by: Dafna Hirschfeld Closes: https://lore.kernel.org/intel-xe/jrwua7ckbiozfcaodx4gg2h4taiuxs53j5zlpf3qzvyhyiyl2d@pbs3plurokrj/ Suggested-by: Dafna Hirschfeld Fixes: b06d47be7c83 ("drm/xe: Port Xe to GPUVA") Signed-off-by: Thomas Hellström Acked-by: Lucas De Marchi Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20231222175904.16732-1-thomas.hellstrom@linux.intel.com (cherry picked from commit 9d0c1c5618be02c5acda7e6bbb728007b0632984) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_vm.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 0cfe7289b97ef..b0e3cab6a5845 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -2063,9 +2063,11 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo, if (err) return ERR_PTR(err); - vm_bo = drm_gpuvm_bo_find(&vm->gpuvm, obj); - if (!vm_bo) - break; + vm_bo = drm_gpuvm_bo_obtain(&vm->gpuvm, obj); + if (IS_ERR(vm_bo)) { + xe_bo_unlock(bo); + return ERR_CAST(vm_bo); + } ops = drm_gpuvm_bo_unmap_ops_create(vm_bo); drm_gpuvm_bo_put(vm_bo); -- GitLab From 3ec276d06698189506f508f87c0f4f17c11e0251 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 9 Jan 2024 12:24:02 +0100 Subject: [PATCH 0924/1290] drm/xe: Use __iomem for the regs pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The regs pointer points to IO memory. Annotate it properly and fix the corresponding sparse warning. Fixes: a4e2f3a299ea ("drm/xe: refactor xe_mmio_probe_tiles to support MMIO extension") Cc: Koby Elbaz Cc: Ofir Bitton Cc: Moti Haimovski Cc: Rodrigo Vivi Signed-off-by: Thomas Hellström Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240109112405.108136-2-thomas.hellstrom@linux.intel.com (cherry picked from commit 9d03bf30e78673d827484bbc17a6fd8f5e43a039) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_mmio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_mmio.c b/drivers/gpu/drm/xe/xe_mmio.c index f660cfb79f504..c8c5d74b6e904 100644 --- a/drivers/gpu/drm/xe/xe_mmio.c +++ b/drivers/gpu/drm/xe/xe_mmio.c @@ -303,7 +303,7 @@ void xe_mmio_probe_tiles(struct xe_device *xe) u8 id, tile_count = xe->info.tile_count; struct xe_gt *gt = xe_root_mmio_gt(xe); struct xe_tile *tile; - void *regs; + void __iomem *regs; u32 mtcfg; if (tile_count == 1) -- GitLab From 77232e6a28447c2942558d05f1c3115bdf95a9e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 9 Jan 2024 12:24:03 +0100 Subject: [PATCH 0925/1290] drm/xe: Annotate xe_mem_region::mapping with __iomem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pointer points to IO memory, but the __iomem annotation was incorrectly placed. Annotate it correctly, update its usage accordingly and fix the corresponding sparse error. Fixes: 0887a2e7ab62 ("drm/xe: Make xe_mem_region struct") Cc: Oak Zeng Cc: Michael J. Ruhl Cc: Matthew Brost Cc: Rodrigo Vivi Signed-off-by: Thomas Hellström Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240109112405.108136-3-thomas.hellstrom@linux.intel.com (cherry picked from commit 20855b62a30538361e587cfc7c5245f07d4f826a) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_bo.c | 4 ++-- drivers/gpu/drm/xe/xe_device_types.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 8e4a3b1f6b938..3cd29bd015a02 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -442,7 +442,7 @@ static int xe_ttm_io_mem_reserve(struct ttm_device *bdev, if (vram->mapping && mem->placement & TTM_PL_FLAG_CONTIGUOUS) - mem->bus.addr = (u8 *)vram->mapping + + mem->bus.addr = (u8 __force *)vram->mapping + mem->bus.offset; mem->bus.offset += vram->io_start; @@ -734,7 +734,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict, /* Create a new VMAP once kernel BO back in VRAM */ if (!ret && resource_is_vram(new_mem)) { struct xe_mem_region *vram = res_to_mem_region(new_mem); - void *new_addr = vram->mapping + + void __iomem *new_addr = vram->mapping + (new_mem->start << PAGE_SHIFT); if (XE_WARN_ON(new_mem->start == XE_BO_INVALID_OFFSET)) { diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index c45ef17b34732..4b38c6bc6c76c 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -97,7 +97,7 @@ struct xe_mem_region { */ resource_size_t actual_physical_size; /** @mapping: pointer to VRAM mappable space */ - void *__iomem mapping; + void __iomem *mapping; }; /** -- GitLab From 5c63e7574739c034e072dea0e0a6fcbe8d538666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 9 Jan 2024 12:24:04 +0100 Subject: [PATCH 0926/1290] drm/xe: Annotate multiple mmio pointers with __iomem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a couple of pointers pointing to MMIO space. Annotate them with __iomem and fix the corresponding sparse warnings. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Fixes: 3b0d4a557996 ("drm/xe: Move register MMIO into xe_tile") Fixes: 399a13323f0d ("drm/xe: add 28-bit address support in struct xe_reg") Cc: Rodrigo Vivi Cc: Matthew Brost Cc: Lucas De Marchi Cc: Matt Roper Cc: Koby Elbaz Cc: Ofir Bitton Cc: Moti Haimovski Signed-off-by: Thomas Hellström Reviewed-by: Lucas De Marchi Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20240109112405.108136-4-thomas.hellstrom@linux.intel.com (cherry picked from commit 9d612ee52c6096bc70d43f54921ba2831ffbf1ad) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_device_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 4b38c6bc6c76c..5dc9127a20293 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -146,7 +146,7 @@ struct xe_tile { size_t size; /** @regs: pointer to tile's MMIO space (starting with registers) */ - void *regs; + void __iomem *regs; } mmio; /** @@ -159,7 +159,7 @@ struct xe_tile { size_t size; /** @regs: pointer to tile's additional MMIO-extension space */ - void *regs; + void __iomem *regs; } mmio_ext; /** @mem: memory management info for tile */ @@ -301,7 +301,7 @@ struct xe_device { /** @size: size of MMIO space for device */ size_t size; /** @regs: pointer to MMIO space for device */ - void *regs; + void __iomem *regs; } mmio; /** @mem: memory info for device */ -- GitLab From 98949068eb559a31f162ab37f56a89bf6c3698ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 9 Jan 2024 12:24:05 +0100 Subject: [PATCH 0927/1290] drm/xe: Annotate xe_ttm_stolen_mgr::mapping with __iomem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pointer points to IO memory, but the __iomem annotation was incorrectly placed. Annotate it correctly, update its usage accordingly and fix the corresponding sparse error. Fixes: d8b52a02cb40 ("drm/xe: Implement stolen memory.") Cc: Maarten Lankhorst Cc: Matthew Brost Cc: Rodrigo Vivi Signed-off-by: Thomas Hellström Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240109112405.108136-5-thomas.hellstrom@linux.intel.com (cherry picked from commit dcddb6f0b06d454c9a3b2b240a43f0e7310c7f7c) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c index d2b00d0bf1e20..e5d7d5e2bec12 100644 --- a/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c @@ -31,7 +31,7 @@ struct xe_ttm_stolen_mgr { /* GPU base offset */ resource_size_t stolen_base; - void *__iomem mapping; + void __iomem *mapping; }; static inline struct xe_ttm_stolen_mgr * @@ -275,7 +275,7 @@ static int __xe_ttm_stolen_io_mem_reserve_bar2(struct xe_device *xe, drm_WARN_ON(&xe->drm, !(mem->placement & TTM_PL_FLAG_CONTIGUOUS)); if (mem->placement & TTM_PL_FLAG_CONTIGUOUS && mgr->mapping) - mem->bus.addr = (u8 *)mgr->mapping + mem->bus.offset; + mem->bus.addr = (u8 __force *)mgr->mapping + mem->bus.offset; mem->bus.offset += mgr->io_base; mem->bus.is_iomem = true; -- GitLab From fef257eb6dcb9f39baee9ac44f064cd796ecfd0b Mon Sep 17 00:00:00 2001 From: Brian Welty Date: Fri, 5 Jan 2024 11:04:39 -0800 Subject: [PATCH 0928/1290] drm/xe: Fix guc_exec_queue_set_priority MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to set q->priority prior to calling guc_exec_queue_add_msg() as that will call init_policies() and sets the scheduling properties to those stored in the exec_queue. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Brian Welty Signed-off-by: Matthew Brost Reviewed-by: Matthew Brost (cherry picked from commit b16483f9f8120b530327879fa3ea576e897946da) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_guc_submit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 21ac68e3246f8..5de3ac47c4623 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1308,8 +1308,8 @@ static int guc_exec_queue_set_priority(struct xe_exec_queue *q, if (!msg) return -ENOMEM; - guc_exec_queue_add_msg(q, msg, SET_SCHED_PROPS); q->priority = priority; + guc_exec_queue_add_msg(q, msg, SET_SCHED_PROPS); return 0; } -- GitLab From 19c02225242498eea9267d444ee1276016368d49 Mon Sep 17 00:00:00 2001 From: Brian Welty Date: Fri, 5 Jan 2024 11:04:40 -0800 Subject: [PATCH 0929/1290] drm/xe: Fix modifying exec_queue priority in xe_migrate_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After exec_queue has been created, we cannot simply modify q->priority. This needs to be done by the backend via q->ops. However in this case, it would be more efficient to simply pass a flag when creating the exec_queue and set the desired priority upfront during queue creation. To that end: new flag EXEC_QUEUE_FLAG_HIGH_PRIORITY is introduced. The priority field is moved to be with other scheduling properties and is now exec_queue.sched_props.priority. This is no longer set to initial value by the backend, but is now set within __xe_exec_queue_create(). Fixes: b4eecedc75c1 ("drm/xe: Fix potential deadlock handling page faults") Signed-off-by: Brian Welty Signed-off-by: Matthew Brost Reviewed-by: Matthew Brost (cherry picked from commit a8004af338f6b3319476ecbed63ea49bf393fc1f) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_exec_queue.c | 5 +++++ drivers/gpu/drm/xe/xe_exec_queue_types.h | 6 ++++-- drivers/gpu/drm/xe/xe_guc_submit.c | 7 +++---- drivers/gpu/drm/xe/xe_migrate.c | 5 ++--- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 44fe8097b7cda..bcfc4127c7c59 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -67,6 +67,11 @@ static struct xe_exec_queue *__xe_exec_queue_create(struct xe_device *xe, q->sched_props.timeslice_us = hwe->eclass->sched_props.timeslice_us; q->sched_props.preempt_timeout_us = hwe->eclass->sched_props.preempt_timeout_us; + if (q->flags & EXEC_QUEUE_FLAG_KERNEL && + q->flags & EXEC_QUEUE_FLAG_HIGH_PRIORITY) + q->sched_props.priority = XE_EXEC_QUEUE_PRIORITY_KERNEL; + else + q->sched_props.priority = XE_EXEC_QUEUE_PRIORITY_NORMAL; if (xe_exec_queue_is_parallel(q)) { q->parallel.composite_fence_ctx = dma_fence_context_alloc(1); diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 3d7e704ec3d9f..8d4b7feb8c306 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -52,8 +52,6 @@ struct xe_exec_queue { struct xe_vm *vm; /** @class: class of this exec queue */ enum xe_engine_class class; - /** @priority: priority of this exec queue */ - enum xe_exec_queue_priority priority; /** * @logical_mask: logical mask of where job submitted to exec queue can run */ @@ -84,6 +82,8 @@ struct xe_exec_queue { #define EXEC_QUEUE_FLAG_VM BIT(4) /* child of VM queue for multi-tile VM jobs */ #define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD BIT(5) +/* kernel exec_queue only, set priority to highest level */ +#define EXEC_QUEUE_FLAG_HIGH_PRIORITY BIT(6) /** * @flags: flags for this exec queue, should statically setup aside from ban @@ -142,6 +142,8 @@ struct xe_exec_queue { u32 timeslice_us; /** @preempt_timeout_us: preemption timeout in micro-seconds */ u32 preempt_timeout_us; + /** @priority: priority of this exec queue */ + enum xe_exec_queue_priority priority; } sched_props; /** @compute: compute exec queue state */ diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 5de3ac47c4623..54ffcfcdd41f9 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -421,7 +421,7 @@ static void init_policies(struct xe_guc *guc, struct xe_exec_queue *q) { struct exec_queue_policy policy; struct xe_device *xe = guc_to_xe(guc); - enum xe_exec_queue_priority prio = q->priority; + enum xe_exec_queue_priority prio = q->sched_props.priority; u32 timeslice_us = q->sched_props.timeslice_us; u32 preempt_timeout_us = q->sched_props.preempt_timeout_us; @@ -1231,7 +1231,6 @@ static int guc_exec_queue_init(struct xe_exec_queue *q) err = xe_sched_entity_init(&ge->entity, sched); if (err) goto err_sched; - q->priority = XE_EXEC_QUEUE_PRIORITY_NORMAL; if (xe_exec_queue_is_lr(q)) INIT_WORK(&q->guc->lr_tdr, xe_guc_exec_queue_lr_cleanup); @@ -1301,14 +1300,14 @@ static int guc_exec_queue_set_priority(struct xe_exec_queue *q, { struct xe_sched_msg *msg; - if (q->priority == priority || exec_queue_killed_or_banned(q)) + if (q->sched_props.priority == priority || exec_queue_killed_or_banned(q)) return 0; msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (!msg) return -ENOMEM; - q->priority = priority; + q->sched_props.priority = priority; guc_exec_queue_add_msg(q, msg, SET_SCHED_PROPS); return 0; diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index adf1dab5eba25..02fca8f9adc28 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -344,7 +344,8 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile) m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe, EXEC_QUEUE_FLAG_KERNEL | - EXEC_QUEUE_FLAG_PERMANENT); + EXEC_QUEUE_FLAG_PERMANENT | + EXEC_QUEUE_FLAG_HIGH_PRIORITY); } else { m->q = xe_exec_queue_create_class(xe, primary_gt, vm, XE_ENGINE_CLASS_COPY, @@ -355,8 +356,6 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile) xe_vm_close_and_put(vm); return ERR_CAST(m->q); } - if (xe->info.has_usm) - m->q->priority = XE_EXEC_QUEUE_PRIORITY_KERNEL; mutex_init(&m->job_mutex); -- GitLab From 23ca3d2fe367794d2816530fa6b141339fddc1c6 Mon Sep 17 00:00:00 2001 From: Vinay Belgaumkar Date: Mon, 8 Jan 2024 14:58:42 -0800 Subject: [PATCH 0930/1290] drm/xe: Check skip_guc_pc before setting SLPC flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't set SLPC GuC feature ctl flag if skip_guc_pc is true. v2: Skip the freq related sysfs creation as well (Badal) v3: Remove unnecessary parenthesis (Lucas) Fixes: 975e4a3795d4 ("drm/xe: Manually setup C6 when skip_guc_pc is set") Fixes: bef52b5c7a19 ("drm/xe: Create a xe_gt_freq component for raw management and sysfs") Reviewed-by: Lucas De Marchi Signed-off-by: Vinay Belgaumkar Link: https://lore.kernel.org/r/20240108225842.966066-1-vinay.belgaumkar@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 69cac0a8f3ef8db4d62441c4a2686ec676c9facd) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_freq.c | 3 +++ drivers/gpu/drm/xe/xe_guc.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_freq.c b/drivers/gpu/drm/xe/xe_gt_freq.c index 3adfa6686e7cf..e5b0f4ecdbe82 100644 --- a/drivers/gpu/drm/xe/xe_gt_freq.c +++ b/drivers/gpu/drm/xe/xe_gt_freq.c @@ -196,6 +196,9 @@ void xe_gt_freq_init(struct xe_gt *gt) struct xe_device *xe = gt_to_xe(gt); int err; + if (xe->info.skip_guc_pc) + return; + gt->freq = kobject_create_and_add("freq0", gt->sysfs); if (!gt->freq) { drm_warn(&xe->drm, "failed to add freq0 directory to %s\n", diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c index 482cb0df9f15b..0a61390c64a7b 100644 --- a/drivers/gpu/drm/xe/xe_guc.c +++ b/drivers/gpu/drm/xe/xe_guc.c @@ -60,7 +60,12 @@ static u32 guc_ctl_debug_flags(struct xe_guc *guc) static u32 guc_ctl_feature_flags(struct xe_guc *guc) { - return GUC_CTL_ENABLE_SLPC; + u32 flags = 0; + + if (!guc_to_xe(guc)->info.skip_guc_pc) + flags |= GUC_CTL_ENABLE_SLPC; + + return flags; } static u32 guc_ctl_log_params_flags(struct xe_guc *guc) -- GitLab From 190db3b1da8f40131d6153de7469abce16766302 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jan 2024 06:48:29 -0800 Subject: [PATCH 0931/1290] drm/xe: Fix build bug for GCC 11 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building drivers/gpu/drm/xe/xe_gt_pagefault.c with GCC 11 results in the following build errors: ./include/linux/fortify-string.h:57:33: error: writing 16 bytes into a region of size 0 [-Werror=stringop-overflow=] 57 | #define __underlying_memcpy __builtin_memcpy | ^ ./include/linux/fortify-string.h:644:9: note: in expansion of macro ‘__underlying_memcpy’ 644 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ ./include/linux/fortify-string.h:689:26: note: in expansion of macro ‘__fortify_memcpy_chk’ 689 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/xe/xe_gt_pagefault.c:340:17: note: in expansion of macro ‘memcpy’ 340 | memcpy(pf_queue->data + pf_queue->tail, msg, len * sizeof(u32)); | ^~~~~~ In file included from drivers/gpu/drm/xe/xe_device_types.h:17, from drivers/gpu/drm/xe/xe_vm_types.h:16, from drivers/gpu/drm/xe/xe_bo.h:13, from drivers/gpu/drm/xe/xe_gt_pagefault.c:16: drivers/gpu/drm/xe/xe_gt_types.h:102:25: note: at offset [1144, 265324] into destination object ‘tile’ of size 8 102 | struct xe_tile *tile; | ^~~~ Fix these by removing -Wstringop-overflow from drm/xe builds. Closes: https://lore.kernel.org/all/45ad1d0f-a10f-483e-848a-76a30252edbe@paulmck-laptop/ Fixes: 7a8bc11782d3 ("drm/xe: Enable W=1 warnings by default") Suggested-by: Stephen Rothwell Signed-off-by: Paul E. McKenney [ This particular warning is broken on GCC11. In future changes it will be moved to the normal C flags in the top level Makefile (out of Makefile.extrawarn), but accounting for the compiler support. Just remove it out of xe's forced extra warnings for now ] Signed-off-by: Lucas De Marchi (cherry picked from commit a109d19992294736abd4f4232ea639e03eb1f9e7) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index 53bd2a8ba1ae5..efcf0ab7a1a69 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -17,7 +17,6 @@ subdir-ccflags-y += $(call cc-option, -Wunused-const-variable) subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned) subdir-ccflags-y += $(call cc-option, -Wformat-overflow) subdir-ccflags-y += $(call cc-option, -Wformat-truncation) -subdir-ccflags-y += $(call cc-option, -Wstringop-overflow) subdir-ccflags-y += $(call cc-option, -Wstringop-truncation) # The following turn off the warnings enabled by -Wextra ifeq ($(findstring 2, $(KBUILD_EXTRA_WARN)),) -- GitLab From ffd915e41a4a2277fd8041dc77603df59acf3e01 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 Jan 2024 15:22:23 +0300 Subject: [PATCH 0932/1290] drm/xe/device: clean up on error in probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This error path should clean up before returning. Smatch detected this bug: drivers/gpu/drm/xe/xe_device.c:487 xe_device_probe() warn: missing unwind goto? Fixes: 4cb12b71923b ("drm/xe/xe2: Determine bios enablement for flat ccs on igfx") Signed-off-by: Dan Carpenter Signed-off-by: Matthew Brost Reviewed-by: Matthew Brost (cherry picked from commit c10da95afa68060e13c5f920d96671943a7e54d9) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index d9ae77fe7382d..b8d8da5466708 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -484,7 +484,7 @@ int xe_device_probe(struct xe_device *xe) err = xe_device_set_has_flat_ccs(xe); if (err) - return err; + goto err_irq_shutdown; err = xe_mmio_probe_vram(xe); if (err) -- GitLab From 616576df35193bbadac31dc42a32d5943e183f45 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 Jan 2024 15:20:35 +0300 Subject: [PATCH 0933/1290] drm/xe/selftests: Fix an error pointer dereference bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check if "bo" is an error pointer before calling xe_bo_lock() on it. Fixes: d6abc18d6693 ("drm/xe/xe2: Modify xe_bo_test for system memory") Signed-off-by: Dan Carpenter Signed-off-by: Matthew Brost Reviewed-by: Matthew Brost (cherry picked from commit 88ec23528b32ddb9ce2e8492f2629b0056353697) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/tests/xe_bo.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/tests/xe_bo.c b/drivers/gpu/drm/xe/tests/xe_bo.c index 412b2e7ce40cb..3436fd9cf2b27 100644 --- a/drivers/gpu/drm/xe/tests/xe_bo.c +++ b/drivers/gpu/drm/xe/tests/xe_bo.c @@ -125,14 +125,13 @@ static void ccs_test_run_tile(struct xe_device *xe, struct xe_tile *tile, bo = xe_bo_create_user(xe, NULL, NULL, SZ_1M, DRM_XE_GEM_CPU_CACHING_WC, ttm_bo_type_device, bo_flags); - - xe_bo_lock(bo, false); - if (IS_ERR(bo)) { KUNIT_FAIL(test, "Failed to create bo.\n"); return; } + xe_bo_lock(bo, false); + kunit_info(test, "Verifying that CCS data is cleared on creation.\n"); ret = ccs_test_migrate(tile, bo, false, 0ULL, 0xdeadbeefdeadbeefULL, test); -- GitLab From ec32f4f1bed87f0b87b9b0091231c8685db1138c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 Jan 2024 15:20:22 +0300 Subject: [PATCH 0934/1290] drm/xe: unlock on error path in xe_vm_add_compute_exec_queue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the "&vm->lock" before returning. Fixes: 24f947d58fe5 ("drm/xe: Use DRM GPUVM helpers for external- and evicted objects") Signed-off-by: Dan Carpenter Signed-off-by: Matthew Brost Reviewed-by: Matthew Brost (cherry picked from commit cf46019e8550a810cc023af7aa020ba43103b44d) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_vm.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index b0e3cab6a5845..10b6995fbf294 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -335,13 +335,13 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q) down_write(&vm->lock); err = drm_gpuvm_exec_lock(&vm_exec); if (err) - return err; + goto out_up_write; pfence = xe_preempt_fence_create(q, q->compute.context, ++q->compute.seqno); if (!pfence) { err = -ENOMEM; - goto out_unlock; + goto out_fini; } list_add(&q->compute.link, &vm->preempt.exec_queues); @@ -364,8 +364,9 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q) up_read(&vm->userptr.notifier_lock); -out_unlock: +out_fini: drm_exec_fini(exec); +out_up_write: up_write(&vm->lock); return err; -- GitLab From 7425c43c268f859426d02ccb3f043bdbae31cca9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 10 Jan 2024 17:34:15 +0100 Subject: [PATCH 0935/1290] drm/xe/migrate: Fix CCS copy for small VRAM copy chunks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the migrate code is using the identity map for addressing VRAM, copy chunks may become as small as 64K if the VRAM resource is fragmented. However, a chunk size smaller that 1MiB may lead to the *next* chunk's offset into the CCS metadata backup memory may not be page-aligned, and the XY_CTRL_SURF_COPY_BLT command can't handle that, and even if it could, the current code doesn't handle the offset calculaton correctly. To fix this, make sure we align the size of VRAM copy chunks to 1MiB. If the remaining data to copy is smaller than that, that's not a problem, so use the remaining size. If the VRAM copy cunk becomes fragmented due to the size alignment restriction, don't use the identity map, but instead emit PTEs into the page-table like we do for system memory. v2: - Rebase v3: - Future proof somewhat by taking into account the real data size to flat CCS metadata size ratio. (Matt Roper) - Invert a couple of if-statements for better readability. - Fix support for 4K-granularity VRAM sizes. (Tested on DG1). v4: - Fix up code comments - Fix debug printout format typo. v5: - Add a Fixes: tag. Cc: Matt Roper Cc: Matthew Auld Cc: Matthew Brost Fixes: e89b384cde62 ("drm/xe/migrate: Update emit_pte to cope with a size level than 4k") Signed-off-by: Thomas Hellström Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20240110163415.524165-1-thomas.hellstrom@linux.intel.com (cherry picked from commit ef51d7542d143f3fd9a48d4e2c307563661668aa) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/tests/xe_migrate.c | 2 +- drivers/gpu/drm/xe/xe_migrate.c | 128 ++++++++++++++++---------- 2 files changed, 80 insertions(+), 50 deletions(-) diff --git a/drivers/gpu/drm/xe/tests/xe_migrate.c b/drivers/gpu/drm/xe/tests/xe_migrate.c index 7a32faa2f6888..a6523df0f1d39 100644 --- a/drivers/gpu/drm/xe/tests/xe_migrate.c +++ b/drivers/gpu/drm/xe/tests/xe_migrate.c @@ -331,7 +331,7 @@ static void xe_migrate_sanity_test(struct xe_migrate *m, struct kunit *test) xe_res_first_sg(xe_bo_sg(pt), 0, pt->size, &src_it); emit_pte(m, bb, NUM_KERNEL_PDE - 1, xe_bo_is_vram(pt), false, - &src_it, XE_PAGE_SIZE, pt); + &src_it, XE_PAGE_SIZE, pt->ttm.resource); run_sanity_job(m, xe, bb, bb->len, "Writing PTE for our fake PT", test); diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 02fca8f9adc28..e05e9e7282b68 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -62,6 +62,8 @@ struct xe_migrate { * out of the pt_bo. */ struct drm_suballoc_manager vm_update_sa; + /** @min_chunk_size: For dgfx, Minimum chunk size */ + u64 min_chunk_size; }; #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */ @@ -363,6 +365,19 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile) if (err) return ERR_PTR(err); + if (IS_DGFX(xe)) { + if (xe_device_has_flat_ccs(xe)) + /* min chunk size corresponds to 4K of CCS Metadata */ + m->min_chunk_size = SZ_4K * SZ_64K / + xe_device_ccs_bytes(xe, SZ_64K); + else + /* Somewhat arbitrary to avoid a huge amount of blits */ + m->min_chunk_size = SZ_64K; + m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size); + drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n", + (unsigned long long)m->min_chunk_size); + } + return m; } @@ -374,16 +389,35 @@ static u64 max_mem_transfer_per_pass(struct xe_device *xe) return MAX_PREEMPTDISABLE_TRANSFER; } -static u64 xe_migrate_res_sizes(struct xe_device *xe, struct xe_res_cursor *cur) +static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur) { - /* - * For VRAM we use identity mapped pages so we are limited to current - * cursor size. For system we program the pages ourselves so we have no - * such limitation. - */ - return min_t(u64, max_mem_transfer_per_pass(xe), - mem_type_is_vram(cur->mem_type) ? cur->size : - cur->remaining); + struct xe_device *xe = tile_to_xe(m->tile); + u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining); + + if (mem_type_is_vram(cur->mem_type)) { + /* + * VRAM we want to blit in chunks with sizes aligned to + * min_chunk_size in order for the offset to CCS metadata to be + * page-aligned. If it's the last chunk it may be smaller. + * + * Another constraint is that we need to limit the blit to + * the VRAM block size, unless size is smaller than + * min_chunk_size. + */ + u64 chunk = max_t(u64, cur->size, m->min_chunk_size); + + size = min_t(u64, size, chunk); + if (size > m->min_chunk_size) + size = round_down(size, m->min_chunk_size); + } + + return size; +} + +static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur) +{ + /* If the chunk is not fragmented, allow identity map. */ + return cur->size >= size; } static u32 pte_update_size(struct xe_migrate *m, @@ -396,7 +430,12 @@ static u32 pte_update_size(struct xe_migrate *m, u32 cmds = 0; *L0_pt = pt_ofs; - if (!is_vram) { + if (is_vram && xe_migrate_allow_identity(*L0, cur)) { + /* Offset into identity map. */ + *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile), + cur->start + vram_region_gpu_offset(res)); + cmds += cmd_size; + } else { /* Clip L0 to available size */ u64 size = min(*L0, (u64)avail_pts * SZ_2M); u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE); @@ -412,11 +451,6 @@ static u32 pte_update_size(struct xe_migrate *m, /* Each chunk has a single blit command */ cmds += cmd_size; - } else { - /* Offset into identity map. */ - *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile), - cur->start + vram_region_gpu_offset(res)); - cmds += cmd_size; } return cmds; @@ -426,10 +460,10 @@ static void emit_pte(struct xe_migrate *m, struct xe_bb *bb, u32 at_pt, bool is_vram, bool is_comp_pte, struct xe_res_cursor *cur, - u32 size, struct xe_bo *bo) + u32 size, struct ttm_resource *res) { struct xe_device *xe = tile_to_xe(m->tile); - + struct xe_vm *vm = m->q->vm; u16 pat_index; u32 ptes; u64 ofs = at_pt * XE_PAGE_SIZE; @@ -442,13 +476,6 @@ static void emit_pte(struct xe_migrate *m, else pat_index = xe->pat.idx[XE_CACHE_WB]; - /* - * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently - * we're only emitting VRAM PTEs during sanity tests, so when - * that's moved to a Kunit test, we should condition VRAM PTEs - * on running tests. - */ - ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE); while (ptes) { @@ -468,20 +495,22 @@ static void emit_pte(struct xe_migrate *m, addr = xe_res_dma(cur) & PAGE_MASK; if (is_vram) { - /* Is this a 64K PTE entry? */ - if ((m->q->vm->flags & XE_VM_FLAG_64K) && - !(cur_ofs & (16 * 8 - 1))) { - xe_tile_assert(m->tile, IS_ALIGNED(addr, SZ_64K)); + if (vm->flags & XE_VM_FLAG_64K) { + u64 va = cur_ofs * XE_PAGE_SIZE / 8; + + xe_assert(xe, (va & (SZ_64K - 1)) == + (addr & (SZ_64K - 1))); + flags |= XE_PTE_PS64; } - addr += vram_region_gpu_offset(bo->ttm.resource); + addr += vram_region_gpu_offset(res); devmem = true; } - addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe, - addr, pat_index, - 0, devmem, flags); + addr = vm->pt_ops->pte_encode_addr(m->tile->xe, + addr, pat_index, + 0, devmem, flags); bb->cs[bb->len++] = lower_32_bits(addr); bb->cs[bb->len++] = upper_32_bits(addr); @@ -693,8 +722,8 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m, bool usm = xe->info.has_usm; u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; - src_L0 = xe_migrate_res_sizes(xe, &src_it); - dst_L0 = xe_migrate_res_sizes(xe, &dst_it); + src_L0 = xe_migrate_res_sizes(m, &src_it); + dst_L0 = xe_migrate_res_sizes(m, &dst_it); drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n", pass++, src_L0, dst_L0); @@ -715,6 +744,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m, &ccs_ofs, &ccs_pt, 0, 2 * avail_pts, avail_pts); + xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE)); } /* Add copy commands size here */ @@ -727,20 +757,20 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m, goto err_sync; } - if (!src_is_vram) - emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0, - src_bo); - else + if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it)) xe_res_next(&src_it, src_L0); - - if (!dst_is_vram) - emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0, - dst_bo); else + emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0, + src); + + if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it)) xe_res_next(&dst_it, src_L0); + else + emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0, + dst); if (copy_system_ccs) - emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src_bo); + emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src); bb->cs[bb->len++] = MI_BATCH_BUFFER_END; update_idx = bb->len; @@ -949,7 +979,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m, bool usm = xe->info.has_usm; u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; - clear_L0 = xe_migrate_res_sizes(xe, &src_it); + clear_L0 = xe_migrate_res_sizes(m, &src_it); drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0); @@ -976,12 +1006,12 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m, size -= clear_L0; /* Preemption is enabled again by the ring ops. */ - if (!clear_vram) { - emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0, - bo); - } else { + if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it)) xe_res_next(&src_it, clear_L0); - } + else + emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0, + dst); + bb->cs[bb->len++] = MI_BATCH_BUFFER_END; update_idx = bb->len; -- GitLab From 8049e3954aeaaeb488cd4e371526721c7fca297e Mon Sep 17 00:00:00 2001 From: Brian Welty Date: Wed, 10 Jan 2024 16:21:11 -0800 Subject: [PATCH 0936/1290] drm/xe: Fix bounds checking in __xe_bo_placement_for_flags() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Requesting all memory regions on PVC will fill bo->placements up to XE_BO_MAX_PLACEMENTS. The subsequent call to try_add_stolen() will trip over the bounds checking even though XE_PL_STOLEN is not expected to be used in this case. This is hit with igt@xe_exec_fault_mode@once-basic-prefetch: xe 0000:8c:00.0: [drm] Assertion `*c < (sizeof(bo->placements) / sizeof((bo->placements)[0]) + ((int)(sizeof(struct { int:(-!!(__builtin_types_compatible_p(typeof((bo->placements)), typeof(&(bo->placements)[0])))); }))))` failed! WARNING: CPU: 30 PID: 6161 at drivers/gpu/drm/xe/xe_bo.c:203 __xe_bo_placement_for_flags+0x218/0x240 [xe] Is fixed here by moving the bounds checks closer to where we actually write into the bo->placement array. Fixes: 8c54ee8a8606 ("drm/xe: Ensure that we don't access the placements array out-of-bounds") Link: https://patchwork.freedesktop.org/patch/msgid/20240111002111.10190-1-brian.welty@intel.com Signed-off-by: Matthew Brost Signed-off-by: Brian Welty Reviewed-by: Matthew Brost (cherry picked from commit 52e3fa3e3ea3ee05e32c1a8d72bb3ae306a4da64) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_bo.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 3cd29bd015a02..0b0e262e2166d 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -125,9 +125,9 @@ static struct xe_mem_region *res_to_mem_region(struct ttm_resource *res) static void try_add_system(struct xe_device *xe, struct xe_bo *bo, u32 bo_flags, u32 *c) { - xe_assert(xe, *c < ARRAY_SIZE(bo->placements)); - if (bo_flags & XE_BO_CREATE_SYSTEM_BIT) { + xe_assert(xe, *c < ARRAY_SIZE(bo->placements)); + bo->placements[*c] = (struct ttm_place) { .mem_type = XE_PL_TT, }; @@ -145,6 +145,8 @@ static void add_vram(struct xe_device *xe, struct xe_bo *bo, struct xe_mem_region *vram; u64 io_size; + xe_assert(xe, *c < ARRAY_SIZE(bo->placements)); + vram = to_xe_ttm_vram_mgr(ttm_manager_type(&xe->ttm, mem_type))->vram; xe_assert(xe, vram && vram->usable_size); io_size = vram->io_size; @@ -175,8 +177,6 @@ static void add_vram(struct xe_device *xe, struct xe_bo *bo, static void try_add_vram(struct xe_device *xe, struct xe_bo *bo, u32 bo_flags, u32 *c) { - xe_assert(xe, *c < ARRAY_SIZE(bo->placements)); - if (bo->props.preferred_gt == XE_GT1) { if (bo_flags & XE_BO_CREATE_VRAM1_BIT) add_vram(xe, bo, bo->placements, bo_flags, XE_PL_VRAM1, c); @@ -193,9 +193,9 @@ static void try_add_vram(struct xe_device *xe, struct xe_bo *bo, static void try_add_stolen(struct xe_device *xe, struct xe_bo *bo, u32 bo_flags, u32 *c) { - xe_assert(xe, *c < ARRAY_SIZE(bo->placements)); - if (bo_flags & XE_BO_CREATE_STOLEN_BIT) { + xe_assert(xe, *c < ARRAY_SIZE(bo->placements)); + bo->placements[*c] = (struct ttm_place) { .mem_type = XE_PL_STOLEN, .flags = bo_flags & (XE_BO_CREATE_PINNED_BIT | -- GitLab From cbcb358b744bf8d8c52b35d5efb1a960438c31cd Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 24 Aug 2023 17:55:51 +0800 Subject: [PATCH 0937/1290] ceph: skip reconnecting if MDS is not ready When MDS closed the session the kclient will send to reconnect to it immediately, but if the MDS just restarted and still not ready yet, such as still in the up:replay state and the sessionmap journal logs hasn't be replayed, the MDS will close the session. And then the kclient could remove the session and later when the mdsmap is in RECONNECT phrase it will skip reconnecting. But the MDS will wait until timeout and then evict the kclient. Just skip sending the reconnection request until the MDS is ready. Link: https://tracker.ceph.com/issues/62489 Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d95eb525519a6..be00c189ed460 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5870,7 +5870,8 @@ static void mds_peer_reset(struct ceph_connection *con) pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n", s->s_mds); - if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO) + if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO && + ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT) send_mds_reconnect(mdsc, s); } -- GitLab From f48e0342a74d7770cdf1d11894bdc3b6d989b29e Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Mon, 6 Nov 2023 10:02:32 +0530 Subject: [PATCH 0938/1290] ceph: reinitialize mds feature bit even when session in open Following along the same lines as per the user-space fix. Right now this isn't really an issue with the ceph kernel driver because of the feature bit laginess, however, that can change over time (when the new snaprealm info type is ported to the kernel driver) and depending on the MDS version that's being upgraded can cause message decoding issues - so, fix that early on. Link: http://tracker.ceph.com/issues/63188 Signed-off-by: Venky Shankar Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index be00c189ed460..6781438f87829 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4128,12 +4128,12 @@ static void handle_session(struct ceph_mds_session *session, pr_info_client(cl, "mds%d reconnect success\n", session->s_mds); + session->s_features = features; if (session->s_state == CEPH_MDS_SESSION_OPEN) { pr_notice_client(cl, "mds%d is already opened\n", session->s_mds); } else { session->s_state = CEPH_MDS_SESSION_OPEN; - session->s_features = features; renewed_caps(mdsc, session, 0); if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) -- GitLab From b79e4a0aa902322756ced7361a2c637d462c3c1c Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 7 Nov 2023 09:37:47 +0800 Subject: [PATCH 0939/1290] libceph: remove MAX_EXTENTS check for sparse reads There is no any limit for the extent array size and it's possible that when reading with a large size contents the total number of extents will exceed 4096. Then the messager will fail by reseting the connection and keeps resending the inflight IOs infinitely. [ idryomov: adjust error message ] Link: https://tracker.ceph.com/issues/62081 Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index d3a759e052c81..625622016f576 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5850,8 +5850,6 @@ static inline void convert_extent_map(struct ceph_sparse_read *sr) } #endif -#define MAX_EXTENTS 4096 - static int osd_sparse_read(struct ceph_connection *con, struct ceph_msg_data_cursor *cursor, char **pbuf) @@ -5882,23 +5880,16 @@ next_op: if (count > 0) { if (!sr->sr_extent || count > sr->sr_ext_len) { - /* - * Apply a hard cap to the number of extents. - * If we have more, assume something is wrong. - */ - if (count > MAX_EXTENTS) { - dout("%s: OSD returned 0x%x extents in a single reply!\n", - __func__, count); - return -EREMOTEIO; - } - /* no extent array provided, or too short */ kfree(sr->sr_extent); sr->sr_extent = kmalloc_array(count, sizeof(*sr->sr_extent), GFP_NOIO); - if (!sr->sr_extent) + if (!sr->sr_extent) { + pr_err("%s: failed to allocate %u extents\n", + __func__, count); return -ENOMEM; + } sr->sr_ext_len = count; } ret = count * sizeof(*sr->sr_extent); -- GitLab From aaefabc4a5f7ae48682c4d2d5d10faaf95c08eb9 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 7 Nov 2023 10:44:41 +0800 Subject: [PATCH 0940/1290] ceph: try to allocate a smaller extent map for sparse read In fscrypt case and for a smaller read length we can predict the max count of the extent map. And for small read length use cases this could save some memories. [ idryomov: squash into a single patch to avoid build break, drop redundant variable in ceph_alloc_sparse_ext_map() ] Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 4 +++- fs/ceph/file.c | 8 ++++++-- fs/ceph/super.h | 14 ++++++++++++++ include/linux/ceph/osd_client.h | 7 +++++-- 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 85be3bf18cdf3..a5caafc1859ec 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -357,6 +357,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) u64 len = subreq->len; bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); u64 off = subreq->start; + int extent_cnt; if (ceph_inode_is_shutdown(inode)) { err = -EIO; @@ -379,7 +380,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) } if (sparse) { - err = ceph_alloc_sparse_ext_map(&req->r_ops[0]); + extent_cnt = __ceph_sparse_read_ext_count(inode, len); + err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt); if (err) goto out; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3b5aae29e9447..4dde0da10079b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1028,6 +1028,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, struct ceph_osd_req_op *op; u64 read_off = off; u64 read_len = len; + int extent_cnt; /* determine new offset/length if encrypted */ ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len); @@ -1067,7 +1068,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, op = &req->r_ops[0]; if (sparse) { - ret = ceph_alloc_sparse_ext_map(op); + extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { ceph_osdc_put_request(req); break; @@ -1464,6 +1466,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ssize_t len; struct ceph_osd_req_op *op; int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ; + int extent_cnt; if (write) size = min_t(u64, size, fsc->mount_options->wsize); @@ -1527,7 +1530,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); op = &req->r_ops[0]; if (sparse) { - ret = ceph_alloc_sparse_ext_map(op); + extent_cnt = __ceph_sparse_read_ext_count(inode, size); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { ceph_osdc_put_request(req); break; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fe0f64a0acb27..b06e2bc86221b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -3,6 +3,7 @@ #define _FS_CEPH_SUPER_H #include +#include #include #include @@ -1407,6 +1408,19 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci, ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota); } +static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len) +{ + int cnt = 0; + + if (IS_ENCRYPTED(inode)) { + cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT; + if (cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL) + cnt = 0; + } + + return cnt; +} + extern void ceph_handle_quota(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index b8610e9d2471f..fa018d5864e74 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -572,9 +572,12 @@ int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt); */ #define CEPH_SPARSE_EXT_ARRAY_INITIAL 16 -static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op) +static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) { - return __ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL); + if (!cnt) + cnt = CEPH_SPARSE_EXT_ARRAY_INITIAL; + + return __ceph_alloc_sparse_ext_map(op, cnt); } extern void ceph_osdc_get_request(struct ceph_osd_request *req); -- GitLab From b493ad718b1f0357394d2cdecbf00a44a36fa085 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 17 Nov 2023 13:26:18 +0800 Subject: [PATCH 0941/1290] ceph: fix deadlock or deadcode of misusing dget() The lock order is incorrect between denty and its parent, we should always make sure that the parent get the lock first. But since this deadcode is never used and the parent dir will always be set from the callers, let's just remove it. Link: https://lore.kernel.org/r/20231116081919.GZ1957730@ZenIV Reported-by: Al Viro Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2c0b8dc3dd0d8..9c02f328c966c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4887,13 +4887,15 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct inode *dir, int mds, int drop, int unless) { - struct dentry *parent = NULL; struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); struct ceph_client *cl; int force = 0; int ret; + /* This shouldn't happen */ + BUG_ON(!dir); + /* * force an record for the directory caps if we have a dentry lease. * this is racy (can't take i_ceph_lock and d_lock together), but it @@ -4903,14 +4905,9 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, spin_lock(&dentry->d_lock); if (di->lease_session && di->lease_session->s_mds == mds) force = 1; - if (!dir) { - parent = dget(dentry->d_parent); - dir = d_inode(parent); - } spin_unlock(&dentry->d_lock); ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); - dput(parent); cl = ceph_inode_to_client(dir); spin_lock(&dentry->d_lock); -- GitLab From 9c896d6bc3dfef86659a6a1fb25ccdea5dbef6a3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 22 Nov 2023 19:08:38 -0800 Subject: [PATCH 0942/1290] ceph: select FS_ENCRYPTION_ALGS if FS_ENCRYPTION The kconfig options for filesystems that support FS_ENCRYPTION are supposed to select FS_ENCRYPTION_ALGS. This is needed to ensure that required crypto algorithms get enabled as loadable modules or builtin as is appropriate for the set of enabled filesystems. Do this for CEPH_FS so that there aren't any missing algorithms if someone happens to have CEPH_FS as their only enabled filesystem that supports encryption. Cc: stable@vger.kernel.org Fixes: f061feda6c54 ("ceph: add fscrypt ioctls and ceph.fscrypt.auth vxattr") Signed-off-by: Eric Biggers Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 94df854147d35..7249d70e1a43f 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -7,6 +7,7 @@ config CEPH_FS select CRYPTO_AES select CRYPTO select NETFS_SUPPORT + select FS_ENCRYPTION_ALGS if FS_ENCRYPTION default n help Choose Y or M here to include support for mounting the -- GitLab From 66207de308df82242da0bf88035b24bcd4377562 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 16 Nov 2023 09:46:19 +0800 Subject: [PATCH 0943/1290] ceph: rename create_session_open_msg() to create_session_full_msg() Makes the create session msg helper to be more general and could be used by other ops. Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6781438f87829..a7eb722c1b48c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1534,7 +1534,8 @@ static int encode_metric_spec(void **p, void *end) * session message, specialization for CEPH_SESSION_REQUEST_OPEN * to include additional client metadata fields. */ -static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq) +static struct ceph_msg * +create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq) { struct ceph_msg *msg; struct ceph_mds_session_head *h; @@ -1589,7 +1590,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 end = p + msg->front.iov_len; h = p; - h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN); + h->op = cpu_to_le32(op); h->seq = cpu_to_le64(seq); /* @@ -1663,7 +1664,8 @@ static int __open_session(struct ceph_mds_client *mdsc, session->s_renew_requested = jiffies; /* send connect message */ - msg = create_session_open_msg(mdsc, session->s_seq); + msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN, + session->s_seq); if (IS_ERR(msg)) return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); -- GitLab From 6df89bf220fdac9f40b0d35cd132eef54cf99d4b Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 16 Nov 2023 10:56:24 +0800 Subject: [PATCH 0944/1290] ceph: send oldest_client_tid when renewing caps Update the oldest_client_tid via the session renew caps msg to make sure that the MDSs won't pile up the completed request list in a very large size. [ idryomov: drop inapplicable comment ] Link: https://tracker.ceph.com/issues/63364 Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a7eb722c1b48c..6cfeba08a3600 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1579,6 +1579,9 @@ create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq) size = METRIC_BYTES(count); extra_bytes += 2 + 4 + 4 + size; + /* flags, mds auth caps and oldest_client_tid */ + extra_bytes += 4 + 4 + 8; + /* Allocate the message */ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, GFP_NOFS, false); @@ -1597,9 +1600,9 @@ create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq) * Serialize client metadata into waiting buffer space, using * the format that userspace expects for map * - * ClientSession messages with metadata are v4 + * ClientSession messages with metadata are v7 */ - msg->hdr.version = cpu_to_le16(4); + msg->hdr.version = cpu_to_le16(7); msg->hdr.compat_version = cpu_to_le16(1); /* The write pointer, following the session_head structure */ @@ -1635,6 +1638,15 @@ create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq) return ERR_PTR(ret); } + /* version == 5, flags */ + ceph_encode_32(&p, 0); + + /* version == 6, mds auth caps */ + ceph_encode_32(&p, 0); + + /* version == 7, oldest_client_tid */ + ceph_encode_64(&p, mdsc->oldest_tid); + msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -2030,10 +2042,10 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, doutc(cl, "to mds%d (%s)\n", session->s_mds, ceph_mds_state_name(state)); - msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, + msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS, ++session->s_renew_seq); - if (!msg) - return -ENOMEM; + if (IS_ERR(msg)) + return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); return 0; } -- GitLab From b36b03344f5fccb81e5cf3b3ede68b7e7a7e930a Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 22 Nov 2023 15:55:14 +0800 Subject: [PATCH 0945/1290] ceph: remove duplicated code in ceph_netfs_issue_read() When allocating an osd request the libceph.ko will add the 'read_from_replica' flag by default. Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a5caafc1859ec..792bbbf8de647 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -371,8 +371,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ, - CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, - NULL, ci->i_truncate_seq, ci->i_truncate_size, false); + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); req = NULL; -- GitLab From 0f4cf64eabc6e16cfc2704f1960e82dc79d91c8d Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Thu, 23 Nov 2023 09:53:40 +0800 Subject: [PATCH 0946/1290] ceph: fix invalid pointer access if get_quota_realm return ERR_PTR This issue is reported by smatch that get_quota_realm() might return ERR_PTR but we did not handle it. It's not a immediate bug, while we still should address it to avoid potential bugs if get_quota_realm() is changed to return other ERR_PTR in future. Set ceph_snap_realm's pointer in get_quota_realm()'s to address this issue, the pointer would be set to NULL if get_quota_realm() failed to get struct ceph_snap_realm, so no ERR_PTR would happen any more. [ xiubli: minor code style clean up ] Signed-off-by: Wenchao Hao Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 9d36c3532de14..06ee397e0c3a6 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -197,10 +197,10 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) } /* - * This function walks through the snaprealm for an inode and returns the - * ceph_snap_realm for the first snaprealm that has quotas set (max_files, + * This function walks through the snaprealm for an inode and set the + * realmp with the first snaprealm that has quotas set (max_files, * max_bytes, or any, depending on the 'which_quota' argument). If the root is - * reached, return the root ceph_snap_realm instead. + * reached, set the realmp with the root ceph_snap_realm instead. * * Note that the caller is responsible for calling ceph_put_snap_realm() on the * returned realm. @@ -211,10 +211,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) * this function will return -EAGAIN; otherwise, the snaprealms walk-through * will be restarted. */ -static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, - struct inode *inode, - enum quota_get_realm which_quota, - bool retry) +static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode, + enum quota_get_realm which_quota, + struct ceph_snap_realm **realmp, bool retry) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci = NULL; @@ -222,8 +221,10 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, struct inode *in; bool has_quota; + if (realmp) + *realmp = NULL; if (ceph_snap(inode) != CEPH_NOSNAP) - return NULL; + return 0; restart: realm = ceph_inode(inode)->i_snap_realm; @@ -250,7 +251,7 @@ restart: break; ceph_put_snap_realm(mdsc, realm); if (!retry) - return ERR_PTR(-EAGAIN); + return -EAGAIN; goto restart; } @@ -259,8 +260,11 @@ restart: iput(in); next = realm->parent; - if (has_quota || !next) - return realm; + if (has_quota || !next) { + if (realmp) + *realmp = realm; + return 0; + } ceph_get_snap_realm(mdsc, next); ceph_put_snap_realm(mdsc, realm); @@ -269,7 +273,7 @@ restart: if (realm) ceph_put_snap_realm(mdsc, realm); - return NULL; + return 0; } bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) @@ -277,6 +281,7 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb); struct ceph_snap_realm *old_realm, *new_realm; bool is_same; + int ret; restart: /* @@ -286,9 +291,9 @@ restart: * dropped and we can then restart the whole operation. */ down_read(&mdsc->snap_rwsem); - old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true); - new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false); - if (PTR_ERR(new_realm) == -EAGAIN) { + get_quota_realm(mdsc, old, QUOTA_GET_ANY, &old_realm, true); + ret = get_quota_realm(mdsc, new, QUOTA_GET_ANY, &new_realm, false); + if (ret == -EAGAIN) { up_read(&mdsc->snap_rwsem); if (old_realm) ceph_put_snap_realm(mdsc, old_realm); @@ -492,8 +497,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) bool is_updated = false; down_read(&mdsc->snap_rwsem); - realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), - QUOTA_GET_MAX_BYTES, true); + get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES, + &realm, true); up_read(&mdsc->snap_rwsem); if (!realm) return false; -- GitLab From f6fb21b22fbe443f92b0d580391a7fb46d1840df Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Dec 2023 05:20:54 +0000 Subject: [PATCH 0947/1290] ceph: d_obtain_{alias,root}(ERR_PTR(...)) will do the right thing Clean up the code. Signed-off-by: Al Viro Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 726af69d4d62c..a79f163ae4ed2 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -286,8 +286,6 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err); } - if (IS_ERR(inode)) - return ERR_CAST(inode); /* see comments in ceph_get_parent() */ return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode); } -- GitLab From 2a965d1b15d28065b35ab4ebd1e51558fcd91aa5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Dec 2023 05:29:25 +0000 Subject: [PATCH 0948/1290] ceph: get rid of passing callbacks in __dentry_leases_walk() __dentry_leases_walk() gets a callback and calls it for a bunch of denties; there are exactly two callers and we already have a flag telling them apart - lwc->dir_lease. Seeing that indirect calls are costly these days, let's get rid of the callback and just call the right function directly. Has a side benefit of saner signatures... [ xiubli: a minor fix in the commit title ] Signed-off-by: Al Viro Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 91709934c8b14..7681587437504 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1593,10 +1593,12 @@ struct ceph_lease_walk_control { unsigned long dir_lease_ttl; }; +static int __dir_lease_check(const struct dentry *, struct ceph_lease_walk_control *); +static int __dentry_lease_check(const struct dentry *); + static unsigned long __dentry_leases_walk(struct ceph_mds_client *mdsc, - struct ceph_lease_walk_control *lwc, - int (*check)(struct dentry*, void*)) + struct ceph_lease_walk_control *lwc) { struct ceph_dentry_info *di, *tmp; struct dentry *dentry, *last = NULL; @@ -1624,7 +1626,10 @@ __dentry_leases_walk(struct ceph_mds_client *mdsc, goto next; } - ret = check(dentry, lwc); + if (lwc->dir_lease) + ret = __dir_lease_check(dentry, lwc); + else + ret = __dentry_lease_check(dentry); if (ret & TOUCH) { /* move it into tail of dir lease list */ __dentry_dir_lease_touch(mdsc, di); @@ -1681,7 +1686,7 @@ next: return freed; } -static int __dentry_lease_check(struct dentry *dentry, void *arg) +static int __dentry_lease_check(const struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); int ret; @@ -1696,9 +1701,9 @@ static int __dentry_lease_check(struct dentry *dentry, void *arg) return DELETE; } -static int __dir_lease_check(struct dentry *dentry, void *arg) +static int __dir_lease_check(const struct dentry *dentry, + struct ceph_lease_walk_control *lwc) { - struct ceph_lease_walk_control *lwc = arg; struct ceph_dentry_info *di = ceph_dentry(dentry); int ret = __dir_lease_try_check(dentry); @@ -1737,7 +1742,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc) lwc.dir_lease = false; lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2; - freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check); + freed = __dentry_leases_walk(mdsc, &lwc); if (!lwc.nr_to_scan) /* more invalid leases */ return -EAGAIN; @@ -1747,7 +1752,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc) lwc.dir_lease = true; lwc.expire_dir_lease = freed < count; lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ; - freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check); + freed +=__dentry_leases_walk(mdsc, &lwc); if (!lwc.nr_to_scan) /* more to check */ return -EAGAIN; -- GitLab From 2b872b0f466d2acb4491da845c66b49246d5cdf9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 15 Jan 2024 22:46:35 +0800 Subject: [PATCH 0949/1290] erofs: Don't use certain unnecessary folio_*() functions Filesystems should use folio->index and folio->mapping, instead of folio_index(folio), folio_mapping() and folio_file_mapping() since they know that it's in the pagecache. Change this automagically with: perl -p -i -e 's/folio_mapping[(]([^)]*)[)]/\1->mapping/g' fs/erofs/*.c perl -p -i -e 's/folio_file_mapping[(]([^)]*)[)]/\1->mapping/g' fs/erofs/*.c perl -p -i -e 's/folio_index[(]([^)]*)[)]/\1->index/g' fs/erofs/*.c Reported-by: Matthew Wilcox Signed-off-by: David Howells Reviewed-by: Jeff Layton Cc: Chao Yu Cc: Yue Hu Cc: Jeffle Xu Cc: linux-erofs@lists.ozlabs.org Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240115144635.1931422-1-hsiangkao@linux.alibaba.com --- fs/erofs/fscache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 87ff35bff8d5b..bc12030393b24 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -165,10 +165,10 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; - struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private; + struct erofs_fscache *ctx = folio->mapping->host->i_private; struct erofs_fscache_request *req; - req = erofs_fscache_req_alloc(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); if (IS_ERR(req)) { folio_unlock(folio); @@ -276,7 +276,7 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio) struct erofs_fscache_request *req; int ret; - req = erofs_fscache_req_alloc(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); if (IS_ERR(req)) { folio_unlock(folio); -- GitLab From 04036d49c44b1772d4158640f3ccd938a12a3cb8 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Sat, 13 Jan 2024 12:09:47 +0800 Subject: [PATCH 0950/1290] virtio_blk: remove duplicate check if queue is broken in virtblk_done virtqueue_enable_cb() will call virtqueue_poll() which will check if queue is broken at beginning, so remove the virtqueue_is_broken() call Acked-by: Jason Wang Signed-off-by: Li RongQing Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 7d7a19b2b9a8e..24963f445cfe4 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -367,8 +367,6 @@ static void virtblk_done(struct virtqueue *vq) blk_mq_complete_request(req); req_done = true; } - if (unlikely(virtqueue_is_broken(vq))) - break; } while (!virtqueue_enable_cb(vq)); /* In case queue is stopped waiting for more buffers. */ -- GitLab From 832b371097eb928d077c827b8f117bf5b99d35c0 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 15 Jan 2024 16:05:26 +0100 Subject: [PATCH 0951/1290] gpiolib: Fix scope-based gpio_device refcounting Commit 9e4555d1e54a ("gpiolib: add support for scope-based management to gpio_device") sought to add scope-based gpio_device refcounting, but erroneously forgot a negation of IS_ERR_OR_NULL(). As a result, gpio_device_put() is not called if the gpio_device pointer is valid (meaning the ref is leaked), but only called if the pointer is NULL or an ERR_PTR(). While at it drop a superfluous trailing semicolon. Fixes: 9e4555d1e54a ("gpiolib: add support for scope-based management to gpio_device") Signed-off-by: Lukas Wunner Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index e846bd4e7559b..9a5c6c76e6533 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -635,7 +635,7 @@ struct gpio_device *gpio_device_get(struct gpio_device *gdev); void gpio_device_put(struct gpio_device *gdev); DEFINE_FREE(gpio_device_put, struct gpio_device *, - if (IS_ERR_OR_NULL(_T)) gpio_device_put(_T)); + if (!IS_ERR_OR_NULL(_T)) gpio_device_put(_T)) struct device *gpio_device_to_device(struct gpio_device *gdev); -- GitLab From bf3ff145df184698a8a80b33265064638572366f Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 11 Jan 2024 12:47:16 +0200 Subject: [PATCH 0952/1290] drm/xe: display support should not depend on EXPERT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the DRM_XE_DISPLAY config dependency on EXPERT. I can only presume the idea was only experts should be able to disable it, but the effect is the opposite. Reported-by: Eero Tamminen Reviewed-by: Francois Dugast Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20240111104716.3548744-1-jani.nikula@intel.com (cherry picked from commit 1c7531f50eaa425eca8ff726287b8df3a4a51e55) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig index 1cced50d8d8c9..e36ae1f0d8859 100644 --- a/drivers/gpu/drm/xe/Kconfig +++ b/drivers/gpu/drm/xe/Kconfig @@ -47,7 +47,7 @@ config DRM_XE config DRM_XE_DISPLAY bool "Enable display support" - depends on DRM_XE && EXPERT && DRM_XE=m + depends on DRM_XE && DRM_XE=m select FB_IOMEM_HELPERS select I2C select I2C_ALGOBIT -- GitLab From 5f4c01f1e3c7b0c8d1e5dd6f080531de7aa5e47b Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Mon, 15 Jan 2024 17:19:34 -0300 Subject: [PATCH 0953/1290] spinlock: Fix failing build for PREEMPT_RT Since 1d71b30e1f85 ("sched.h: Move (spin|rwlock)_needbreak() to spinlock.h") build fails for PREEMPT_RT, since there is no definition available of either spin_needbreak() and rwlock_needbreak(). Since it was moved on the mentioned commit, it was placed inside a !PREEMPT_RT part of the code, making it out of reach for an RT kernel. Fix this by moving code it a few lines down so it can be reached by an RT build, where it can also make use of the *_is_contended() definition added by the spinlock_rt.h. Fixes: d1d71b30e1f85 ("sched.h: Move (spin|rwlock)_needbreak() to spinlock.h") Signed-off-by: Leonardo Bras Signed-off-by: Kent Overstreet Acked-by: Waiman Long --- include/linux/spinlock.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 0c71f06454d9e..b5c59fdad160f 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -449,6 +449,12 @@ static __always_inline int spin_is_contended(spinlock_t *lock) return raw_spin_is_contended(&lock->rlock); } +#define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) + +#else /* !CONFIG_PREEMPT_RT */ +# include +#endif /* CONFIG_PREEMPT_RT */ + /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPTION, @@ -480,12 +486,6 @@ static inline int rwlock_needbreak(rwlock_t *lock) #endif } -#define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) - -#else /* !CONFIG_PREEMPT_RT */ -# include -#endif /* CONFIG_PREEMPT_RT */ - /* * Pull the atomic_t declaration: * (asm-mips/atomic.h needs above definitions) -- GitLab From eea7615b684fc98cd0403beaaa2194e6f029c812 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 14 Jan 2024 15:13:20 -0800 Subject: [PATCH 0954/1290] rtc: ac100: remove misuses of kernel-doc Prevent kernel-doc warnings by changing "/**" to common comment format "/*" in non-kernel-doc comments: drivers/rtc/rtc-ac100.c:103: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Clock controls for 3 clock output pins drivers/rtc/rtc-ac100.c:382: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * RTC related bits Signed-off-by: Randy Dunlap Cc: Chen-Yu Tsai Cc: Alexandre Belloni Cc: Link: https://lore.kernel.org/r/20240114231320.31437-1-rdunlap@infradead.org Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ac100.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-ac100.c b/drivers/rtc/rtc-ac100.c index eaf2c9ab96619..fa642bba3cee0 100644 --- a/drivers/rtc/rtc-ac100.c +++ b/drivers/rtc/rtc-ac100.c @@ -99,7 +99,7 @@ struct ac100_rtc_dev { struct clk_hw_onecell_data *clk_data; }; -/** +/* * Clock controls for 3 clock output pins */ @@ -378,7 +378,7 @@ static void ac100_rtc_unregister_clks(struct ac100_rtc_dev *chip) clk_unregister_fixed_rate(chip->rtc_32k_clk->clk); } -/** +/* * RTC related bits */ static int ac100_rtc_get_time(struct device *dev, struct rtc_time *rtc_tm) -- GitLab From 83c0711453e54dc696b13e9512fbbed6d9180ea2 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Mon, 8 Jan 2024 01:43:57 +0100 Subject: [PATCH 0955/1290] rtc: rv8803: add wakeup-source support The RV8803 can be wired directly to a PMIC that can wake up an SoC without the CPU getting interrupts. Link: https://lore.kernel.org/r/20240108004357.602918-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- Documentation/devicetree/bindings/rtc/epson,rx8900.yaml | 2 ++ drivers/rtc/rtc-rv8803.c | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/rtc/epson,rx8900.yaml b/Documentation/devicetree/bindings/rtc/epson,rx8900.yaml index 1df7c45d95c18..b770149c5fd67 100644 --- a/Documentation/devicetree/bindings/rtc/epson,rx8900.yaml +++ b/Documentation/devicetree/bindings/rtc/epson,rx8900.yaml @@ -29,6 +29,8 @@ properties: trickle-diode-disable: true + wakeup-source: true + required: - compatible - reg diff --git a/drivers/rtc/rtc-rv8803.c b/drivers/rtc/rtc-rv8803.c index 11e6b0d31f5d7..1327251e527c2 100644 --- a/drivers/rtc/rtc-rv8803.c +++ b/drivers/rtc/rtc-rv8803.c @@ -712,9 +712,12 @@ static int rv8803_probe(struct i2c_client *client) if (err) dev_err(&client->dev, "failed to set wake IRQ\n"); } + } else { + if (device_property_read_bool(&client->dev, "wakeup-source")) + device_init_wakeup(&client->dev, true); + else + clear_bit(RTC_FEATURE_ALARM, rv8803->rtc->features); } - if (!client->irq) - clear_bit(RTC_FEATURE_ALARM, rv8803->rtc->features); if (of_property_read_bool(client->dev.of_node, "epson,vdet-disable")) rv8803->backup |= RX8900_FLAG_VDETOFF; -- GitLab From 62b38f30a00ff47142e58d0a6d60dd296e0e8108 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Jan 2024 10:56:47 -0800 Subject: [PATCH 0956/1290] gpio: EN7523: fix kernel-doc warnings Add "struct" keyword and explain the @dir array differently to prevent kernel-doc warnings: gpio-en7523.c:22: warning: cannot understand function prototype: 'struct airoha_gpio_ctrl ' gpio-en7523.c:27: warning: Function parameter or struct member 'dir' not described in 'airoha_gpio_ctrl' gpio-en7523.c:27: warning: Excess struct member 'dir0' description in 'airoha_gpio_ctrl' gpio-en7523.c:27: warning: Excess struct member 'dir1' description in 'airoha_gpio_ctrl' Fixes: 0868ad385aff ("gpio: Add support for Airoha EN7523 GPIO controller") Signed-off-by: Randy Dunlap Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-en7523.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-en7523.c b/drivers/gpio/gpio-en7523.c index f836a8db4c1d2..69834db2c1cf2 100644 --- a/drivers/gpio/gpio-en7523.c +++ b/drivers/gpio/gpio-en7523.c @@ -12,11 +12,11 @@ #define AIROHA_GPIO_MAX 32 /** - * airoha_gpio_ctrl - Airoha GPIO driver data + * struct airoha_gpio_ctrl - Airoha GPIO driver data * @gc: Associated gpio_chip instance. * @data: The data register. - * @dir0: The direction register for the lower 16 pins. - * @dir1: The direction register for the higher 16 pins. + * @dir: [0] The direction register for the lower 16 pins. + * [1]: The direction register for the higher 16 pins. * @output: The output enable register. */ struct airoha_gpio_ctrl { -- GitLab From 5905777847b5c3aa8aefe2aad5103f9f468fb990 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 20 Nov 2023 14:00:16 +0200 Subject: [PATCH 0957/1290] dt-bindings: rtc: max31335: add max31335 bindings Document the Analog Devices MAX31335 device tree bindings. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Antoniu Miclaus Link: https://lore.kernel.org/r/20231120120114.48657-1-antoniu.miclaus@analog.com Signed-off-by: Alexandre Belloni --- .../devicetree/bindings/rtc/adi,max31335.yaml | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 Documentation/devicetree/bindings/rtc/adi,max31335.yaml diff --git a/Documentation/devicetree/bindings/rtc/adi,max31335.yaml b/Documentation/devicetree/bindings/rtc/adi,max31335.yaml new file mode 100644 index 0000000000000..0125cf6727cc3 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/adi,max31335.yaml @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/adi,max31335.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Analog Devices MAX31335 RTC + +maintainers: + - Antoniu Miclaus + +description: + Analog Devices MAX31335 I2C RTC ±2ppm Automotive Real-Time Clock with + Integrated MEMS Resonator. + +allOf: + - $ref: rtc.yaml# + +properties: + compatible: + const: adi,max31335 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + "#clock-cells": + description: + RTC can be used as a clock source through its clock output pin. + const: 0 + + adi,tc-diode: + description: + Select the diode configuration for the trickle charger. + schottky - Schottky diode in series. + standard+schottky - standard diode + Schottky diode in series. + enum: [schottky, standard+schottky] + + trickle-resistor-ohms: + description: + Selected resistor for trickle charger. Should be specified if trickle + charger should be enabled. + enum: [3000, 6000, 11000] + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + #include + i2c { + #address-cells = <1>; + #size-cells = <0>; + + rtc@68 { + compatible = "adi,max31335"; + reg = <0x68>; + pinctrl-0 = <&rtc_nint_pins>; + interrupts-extended = <&gpio1 16 IRQ_TYPE_LEVEL_HIGH>; + aux-voltage-chargeable = <1>; + trickle-resistor-ohms = <6000>; + adi,tc-diode = "schottky"; + }; + }; +... -- GitLab From dedaf03b99d6561fae06457fd7fc2b0aa154d003 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 20 Nov 2023 14:00:17 +0200 Subject: [PATCH 0958/1290] rtc: max31335: add driver support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RTC driver for MAX31335 ±2ppm Automotive Real-Time Clock with Integrated MEMS Resonator. Reviewed-by: Guenter Roeck Signed-off-by: Antoniu Miclaus Link: https://lore.kernel.org/r/20231120120114.48657-2-antoniu.miclaus@analog.com Signed-off-by: Alexandre Belloni --- MAINTAINERS | 8 + drivers/rtc/Kconfig | 13 + drivers/rtc/Makefile | 1 + drivers/rtc/rtc-max31335.c | 707 +++++++++++++++++++++++++++++++++++++ 4 files changed, 729 insertions(+) create mode 100644 drivers/rtc/rtc-max31335.c diff --git a/MAINTAINERS b/MAINTAINERS index 2635f7b59e3ca..b4e368049fbff 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12986,6 +12986,14 @@ F: Documentation/devicetree/bindings/hwmon/adi,max31827.yaml F: Documentation/hwmon/max31827.rst F: drivers/hwmon/max31827.c +MAX31335 RTC DRIVER +M: Antoniu Miclaus +L: linux-rtc@vger.kernel.org +S: Supported +W: https://ez.analog.com/linux-software-drivers +F: Documentation/devicetree/bindings/rtc/adi,max31335.yaml +F: drivers/rtc/rtc-max31335.c + MAX6650 HARDWARE MONITOR AND FAN CONTROLLER DRIVER L: linux-hwmon@vger.kernel.org S: Orphan diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 1ea9c76001505..e37a4341f442d 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -373,6 +373,19 @@ config RTC_DRV_MAX8997 This driver can also be built as a module. If so, the module will be called rtc-max8997. +config RTC_DRV_MAX31335 + tristate "Analog Devices MAX31335" + depends on I2C + depends on COMMON_CLK + depends on HWMON || HWMON=n + select REGMAP_I2C + help + If you say yes here you get support for the Analog Devices + MAX31335. + + This driver can also be built as a module. If so, the module + will be called rtc-max31335. + config RTC_DRV_MAX77686 tristate "Maxim MAX77686" depends on MFD_MAX77686 || MFD_MAX77620 || MFD_MAX77714 || COMPILE_TEST diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 31c4606fd9bf0..6efff381c484d 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_RTC_DRV_M48T35) += rtc-m48t35.o obj-$(CONFIG_RTC_DRV_M48T59) += rtc-m48t59.o obj-$(CONFIG_RTC_DRV_M48T86) += rtc-m48t86.o obj-$(CONFIG_RTC_DRV_MA35D1) += rtc-ma35d1.o +obj-$(CONFIG_RTC_DRV_MAX31335) += rtc-max31335.o obj-$(CONFIG_RTC_DRV_MAX6900) += rtc-max6900.o obj-$(CONFIG_RTC_DRV_MAX6902) += rtc-max6902.o obj-$(CONFIG_RTC_DRV_MAX6916) += rtc-max6916.o diff --git a/drivers/rtc/rtc-max31335.c b/drivers/rtc/rtc-max31335.c new file mode 100644 index 0000000000000..3ddfe71bbb560 --- /dev/null +++ b/drivers/rtc/rtc-max31335.c @@ -0,0 +1,707 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RTC driver for the MAX31335 + * + * Copyright (C) 2023 Analog Devices + * + * Antoniu Miclaus + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* MAX31335 Register Map */ +#define MAX31335_STATUS1 0x00 +#define MAX31335_INT_EN1 0x01 +#define MAX31335_STATUS2 0x02 +#define MAX31335_INT_EN2 0x03 +#define MAX31335_RTC_RESET 0x04 +#define MAX31335_RTC_CONFIG 0x05 +#define MAX31335_RTC_CONFIG2 0x06 +#define MAX31335_TIMESTAMP_CONFIG 0x07 +#define MAX31335_TIMER_CONFIG 0x08 +#define MAX31335_SECONDS_1_128 0x09 +#define MAX31335_SECONDS 0x0A +#define MAX31335_MINUTES 0x0B +#define MAX31335_HOURS 0x0C +#define MAX31335_DAY 0x0D +#define MAX31335_DATE 0x0E +#define MAX31335_MONTH 0x0F +#define MAX31335_YEAR 0x0F +#define MAX31335_ALM1_SEC 0x11 +#define MAX31335_ALM1_MIN 0x12 +#define MAX31335_ALM1_HRS 0x13 +#define MAX31335_ALM1_DAY_DATE 0x14 +#define MAX31335_ALM1_MON 0x15 +#define MAX31335_ALM1_YEAR 0x16 +#define MAX31335_ALM2_MIN 0x17 +#define MAX31335_ALM2_HRS 0x18 +#define MAX31335_ALM2_DAY_DATE 0x19 +#define MAX31335_TIMER_COUNT 0x1A +#define MAX31335_TIMER_INIT 0x1B +#define MAX31335_PWR_MGMT 0x1C +#define MAX31335_TRICKLE_REG 0x1D +#define MAX31335_AGING_OFFSET 0x1E +#define MAX31335_TS_CONFIG 0x30 +#define MAX31335_TEMP_ALARM_HIGH_MSB 0x31 +#define MAX31335_TEMP_ALARM_HIGH_LSB 0x32 +#define MAX31335_TEMP_ALARM_LOW_MSB 0x33 +#define MAX31335_TEMP_ALARM_LOW_LSB 0x34 +#define MAX31335_TEMP_DATA_MSB 0x35 +#define MAX31335_TEMP_DATA_LSB 0x36 +#define MAX31335_TS0_SEC_1_128 0x40 +#define MAX31335_TS0_SEC 0x41 +#define MAX31335_TS0_MIN 0x42 +#define MAX31335_TS0_HOUR 0x43 +#define MAX31335_TS0_DATE 0x44 +#define MAX31335_TS0_MONTH 0x45 +#define MAX31335_TS0_YEAR 0x46 +#define MAX31335_TS0_FLAGS 0x47 +#define MAX31335_TS1_SEC_1_128 0x48 +#define MAX31335_TS1_SEC 0x49 +#define MAX31335_TS1_MIN 0x4A +#define MAX31335_TS1_HOUR 0x4B +#define MAX31335_TS1_DATE 0x4C +#define MAX31335_TS1_MONTH 0x4D +#define MAX31335_TS1_YEAR 0x4E +#define MAX31335_TS1_FLAGS 0x4F +#define MAX31335_TS2_SEC_1_128 0x50 +#define MAX31335_TS2_SEC 0x51 +#define MAX31335_TS2_MIN 0x52 +#define MAX31335_TS2_HOUR 0x53 +#define MAX31335_TS2_DATE 0x54 +#define MAX31335_TS2_MONTH 0x55 +#define MAX31335_TS2_YEAR 0x56 +#define MAX31335_TS2_FLAGS 0x57 +#define MAX31335_TS3_SEC_1_128 0x58 +#define MAX31335_TS3_SEC 0x59 +#define MAX31335_TS3_MIN 0x5A +#define MAX31335_TS3_HOUR 0x5B +#define MAX31335_TS3_DATE 0x5C +#define MAX31335_TS3_MONTH 0x5D +#define MAX31335_TS3_YEAR 0x5E +#define MAX31335_TS3_FLAGS 0x5F + +/* MAX31335_STATUS1 Bit Definitions */ +#define MAX31335_STATUS1_PSDECT BIT(7) +#define MAX31335_STATUS1_OSF BIT(6) +#define MAX31335_STATUS1_PFAIL BIT(5) +#define MAX31335_STATUS1_VBATLOW BIT(4) +#define MAX31335_STATUS1_DIF BIT(3) +#define MAX31335_STATUS1_TIF BIT(2) +#define MAX31335_STATUS1_A2F BIT(1) +#define MAX31335_STATUS1_A1F BIT(0) + +/* MAX31335_INT_EN1 Bit Definitions */ +#define MAX31335_INT_EN1_DOSF BIT(6) +#define MAX31335_INT_EN1_PFAILE BIT(5) +#define MAX31335_INT_EN1_VBATLOWE BIT(4) +#define MAX31335_INT_EN1_DIE BIT(3) +#define MAX31335_INT_EN1_TIE BIT(2) +#define MAX31335_INT_EN1_A2IE BIT(1) +#define MAX31335_INT_EN1_A1IE BIT(0) + +/* MAX31335_STATUS2 Bit Definitions */ +#define MAX31335_STATUS2_TEMP_RDY BIT(2) +#define MAX31335_STATUS2_OTF BIT(1) +#define MAX31335_STATUS2_UTF BIT(0) + +/* MAX31335_INT_EN2 Bit Definitions */ +#define MAX31335_INT_EN2_TEMP_RDY_EN BIT(2) +#define MAX31335_INT_EN2_OTIE BIT(1) +#define MAX31335_INT_EN2_UTIE BIT(0) + +/* MAX31335_RTC_RESET Bit Definitions */ +#define MAX31335_RTC_RESET_SWRST BIT(0) + +/* MAX31335_RTC_CONFIG1 Bit Definitions */ +#define MAX31335_RTC_CONFIG1_EN_IO BIT(6) +#define MAX31335_RTC_CONFIG1_A1AC GENMASK(5, 4) +#define MAX31335_RTC_CONFIG1_DIP BIT(3) +#define MAX31335_RTC_CONFIG1_I2C_TIMEOUT BIT(1) +#define MAX31335_RTC_CONFIG1_EN_OSC BIT(0) + +/* MAX31335_RTC_CONFIG2 Bit Definitions */ +#define MAX31335_RTC_CONFIG2_ENCLKO BIT(2) +#define MAX31335_RTC_CONFIG2_CLKO_HZ GENMASK(1, 0) + +/* MAX31335_TIMESTAMP_CONFIG Bit Definitions */ +#define MAX31335_TIMESTAMP_CONFIG_TSVLOW BIT(5) +#define MAX31335_TIMESTAMP_CONFIG_TSPWM BIT(4) +#define MAX31335_TIMESTAMP_CONFIG_TSDIN BIT(3) +#define MAX31335_TIMESTAMP_CONFIG_TSOW BIT(2) +#define MAX31335_TIMESTAMP_CONFIG_TSR BIT(1) +#define MAX31335_TIMESTAMP_CONFIG_TSE BIT(0) + +/* MAX31335_TIMER_CONFIG Bit Definitions */ +#define MAX31335_TIMER_CONFIG_TE BIT(4) +#define MAX31335_TIMER_CONFIG_TPAUSE BIT(3) +#define MAX31335_TIMER_CONFIG_TRPT BIT(2) +#define MAX31335_TIMER_CONFIG_TFS GENMASK(1, 0) + +/* MAX31335_HOURS Bit Definitions */ +#define MAX31335_HOURS_F_24_12 BIT(6) +#define MAX31335_HOURS_HR_20_AM_PM BIT(5) + +/* MAX31335_MONTH Bit Definitions */ +#define MAX31335_MONTH_CENTURY BIT(7) + +/* MAX31335_PWR_MGMT Bit Definitions */ +#define MAX31335_PWR_MGMT_PFVT BIT(0) + +/* MAX31335_TRICKLE_REG Bit Definitions */ +#define MAX31335_TRICKLE_REG_TRICKLE GENMASK(3, 1) +#define MAX31335_TRICKLE_REG_EN_TRICKLE BIT(0) + +/* MAX31335_TS_CONFIG Bit Definitions */ +#define MAX31335_TS_CONFIG_AUTO BIT(4) +#define MAX31335_TS_CONFIG_CONVERT_T BIT(3) +#define MAX31335_TS_CONFIG_TSINT GENMASK(2, 0) + +/* MAX31335_TS_FLAGS Bit Definitions */ +#define MAX31335_TS_FLAGS_VLOWF BIT(3) +#define MAX31335_TS_FLAGS_VBATF BIT(2) +#define MAX31335_TS_FLAGS_VCCF BIT(1) +#define MAX31335_TS_FLAGS_DINF BIT(0) + +/* MAX31335 Miscellaneous Definitions */ +#define MAX31335_TRICKLE_SCHOTTKY_DIODE 1 +#define MAX31335_TRICKLE_STANDARD_DIODE 4 +#define MAX31335_RAM_SIZE 32 +#define MAX31335_TIME_SIZE 0x07 + +#define clk_hw_to_max31335(_hw) container_of(_hw, struct max31335_data, clkout) + +struct max31335_data { + struct regmap *regmap; + struct rtc_device *rtc; + struct clk_hw clkout; +}; + +static const int max31335_clkout_freq[] = { 1, 64, 1024, 32768 }; + +static const u16 max31335_trickle_resistors[] = {3000, 6000, 11000}; + +static bool max31335_volatile_reg(struct device *dev, unsigned int reg) +{ + /* time keeping registers */ + if (reg >= MAX31335_SECONDS && + reg < MAX31335_SECONDS + MAX31335_TIME_SIZE) + return true; + + /* interrupt status register */ + if (reg == MAX31335_INT_EN1_A1IE) + return true; + + /* temperature registers */ + if (reg == MAX31335_TEMP_DATA_MSB || MAX31335_TEMP_DATA_LSB) + return true; + + return false; +} + +static const struct regmap_config regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 0x5F, + .volatile_reg = max31335_volatile_reg, +}; + +static int max31335_read_time(struct device *dev, struct rtc_time *tm) +{ + struct max31335_data *max31335 = dev_get_drvdata(dev); + u8 date[7]; + int ret; + + ret = regmap_bulk_read(max31335->regmap, MAX31335_SECONDS, date, + sizeof(date)); + if (ret) + return ret; + + tm->tm_sec = bcd2bin(date[0] & 0x7f); + tm->tm_min = bcd2bin(date[1] & 0x7f); + tm->tm_hour = bcd2bin(date[2] & 0x3f); + tm->tm_wday = bcd2bin(date[3] & 0x7) - 1; + tm->tm_mday = bcd2bin(date[4] & 0x3f); + tm->tm_mon = bcd2bin(date[5] & 0x1f) - 1; + tm->tm_year = bcd2bin(date[6]) + 100; + + if (FIELD_GET(MAX31335_MONTH_CENTURY, date[5])) + tm->tm_year += 100; + + return 0; +} + +static int max31335_set_time(struct device *dev, struct rtc_time *tm) +{ + struct max31335_data *max31335 = dev_get_drvdata(dev); + u8 date[7]; + + date[0] = bin2bcd(tm->tm_sec); + date[1] = bin2bcd(tm->tm_min); + date[2] = bin2bcd(tm->tm_hour); + date[3] = bin2bcd(tm->tm_wday + 1); + date[4] = bin2bcd(tm->tm_mday); + date[5] = bin2bcd(tm->tm_mon + 1); + date[6] = bin2bcd(tm->tm_year % 100); + + if (tm->tm_year >= 200) + date[5] |= FIELD_PREP(MAX31335_MONTH_CENTURY, 1); + + return regmap_bulk_write(max31335->regmap, MAX31335_SECONDS, date, + sizeof(date)); +} + +static int max31335_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct max31335_data *max31335 = dev_get_drvdata(dev); + int ret, ctrl, status; + struct rtc_time time; + u8 regs[6]; + + ret = regmap_bulk_read(max31335->regmap, MAX31335_ALM1_SEC, regs, + sizeof(regs)); + if (ret) + return ret; + + alrm->time.tm_sec = bcd2bin(regs[0] & 0x7f); + alrm->time.tm_min = bcd2bin(regs[1] & 0x7f); + alrm->time.tm_hour = bcd2bin(regs[2] & 0x3f); + alrm->time.tm_mday = bcd2bin(regs[3] & 0x3f); + alrm->time.tm_mon = bcd2bin(regs[4] & 0x1f) - 1; + alrm->time.tm_year = bcd2bin(regs[5]) + 100; + + ret = max31335_read_time(dev, &time); + if (ret) + return ret; + + if (time.tm_year >= 200) + alrm->time.tm_year += 100; + + ret = regmap_read(max31335->regmap, MAX31335_INT_EN1, &ctrl); + if (ret) + return ret; + + ret = regmap_read(max31335->regmap, MAX31335_STATUS1, &status); + if (ret) + return ret; + + alrm->enabled = FIELD_GET(MAX31335_INT_EN1_A1IE, ctrl); + alrm->pending = FIELD_GET(MAX31335_STATUS1_A1F, status); + + return 0; +} + +static int max31335_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct max31335_data *max31335 = dev_get_drvdata(dev); + unsigned int reg; + u8 regs[6]; + int ret; + + regs[0] = bin2bcd(alrm->time.tm_sec); + regs[1] = bin2bcd(alrm->time.tm_min); + regs[2] = bin2bcd(alrm->time.tm_hour); + regs[3] = bin2bcd(alrm->time.tm_mday); + regs[4] = bin2bcd(alrm->time.tm_mon + 1); + regs[5] = bin2bcd(alrm->time.tm_year % 100); + + ret = regmap_bulk_write(max31335->regmap, MAX31335_ALM1_SEC, + regs, sizeof(regs)); + if (ret) + return ret; + + reg = FIELD_PREP(MAX31335_INT_EN1_A1IE, alrm->enabled); + ret = regmap_update_bits(max31335->regmap, MAX31335_INT_EN1, + MAX31335_INT_EN1_A1IE, reg); + if (ret) + return ret; + + ret = regmap_update_bits(max31335->regmap, MAX31335_STATUS1, + MAX31335_STATUS1_A1F, 0); + + return 0; +} + +static int max31335_alarm_irq_enable(struct device *dev, unsigned int enabled) +{ + struct max31335_data *max31335 = dev_get_drvdata(dev); + + return regmap_update_bits(max31335->regmap, MAX31335_INT_EN1, + MAX31335_INT_EN1_A1IE, enabled); +} + +static irqreturn_t max31335_handle_irq(int irq, void *dev_id) +{ + struct max31335_data *max31335 = dev_id; + struct mutex *lock = &max31335->rtc->ops_lock; + int ret, status; + + mutex_lock(lock); + + ret = regmap_read(max31335->regmap, MAX31335_STATUS1, &status); + if (ret) + goto exit; + + if (FIELD_GET(MAX31335_STATUS1_A1F, status)) { + ret = regmap_update_bits(max31335->regmap, MAX31335_STATUS1, + MAX31335_STATUS1_A1F, 0); + if (ret) + goto exit; + + rtc_update_irq(max31335->rtc, 1, RTC_AF | RTC_IRQF); + } + +exit: + mutex_unlock(lock); + + return IRQ_HANDLED; +} + +static const struct rtc_class_ops max31335_rtc_ops = { + .read_time = max31335_read_time, + .set_time = max31335_set_time, + .read_alarm = max31335_read_alarm, + .set_alarm = max31335_set_alarm, + .alarm_irq_enable = max31335_alarm_irq_enable, +}; + +static int max31335_trickle_charger_setup(struct device *dev, + struct max31335_data *max31335) +{ + u32 ohms, chargeable; + int i, trickle_cfg; + const char *diode; + + if (device_property_read_u32(dev, "aux-voltage-chargeable", + &chargeable)) + return 0; + + if (device_property_read_u32(dev, "trickle-resistor-ohms", &ohms)) + return 0; + + if (device_property_read_string(dev, "adi,tc-diode", &diode)) + return 0; + + if (!strcmp(diode, "schottky")) + trickle_cfg = MAX31335_TRICKLE_SCHOTTKY_DIODE; + else if (!strcmp(diode, "standard+schottky")) + trickle_cfg = MAX31335_TRICKLE_STANDARD_DIODE; + else + return dev_err_probe(dev, -EINVAL, + "Invalid tc-diode value: %s\n", diode); + + for (i = 0; i < ARRAY_SIZE(max31335_trickle_resistors); i++) + if (ohms == max31335_trickle_resistors[i]) + break; + + if (i >= ARRAY_SIZE(max31335_trickle_resistors)) + return 0; + + i = i + trickle_cfg; + + return regmap_write(max31335->regmap, MAX31335_TRICKLE_REG, + FIELD_PREP(MAX31335_TRICKLE_REG_TRICKLE, i) | + FIELD_PREP(MAX31335_TRICKLE_REG_EN_TRICKLE, + chargeable)); +} + +static unsigned long max31335_clkout_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct max31335_data *max31335 = clk_hw_to_max31335(hw); + unsigned int freq_mask; + unsigned int reg; + int ret; + + ret = regmap_read(max31335->regmap, MAX31335_RTC_CONFIG2, ®); + if (ret) + return 0; + + freq_mask = __roundup_pow_of_two(ARRAY_SIZE(max31335_clkout_freq)) - 1; + + return max31335_clkout_freq[reg & freq_mask]; +} + +static long max31335_clkout_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *prate) +{ + int index; + + index = find_closest(rate, max31335_clkout_freq, + ARRAY_SIZE(max31335_clkout_freq)); + + return max31335_clkout_freq[index]; +} + +static int max31335_clkout_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + struct max31335_data *max31335 = clk_hw_to_max31335(hw); + unsigned int freq_mask; + int index; + + index = find_closest(rate, max31335_clkout_freq, + ARRAY_SIZE(max31335_clkout_freq)); + freq_mask = __roundup_pow_of_two(ARRAY_SIZE(max31335_clkout_freq)) - 1; + + return regmap_update_bits(max31335->regmap, MAX31335_RTC_CONFIG2, + freq_mask, index); +} + +static int max31335_clkout_enable(struct clk_hw *hw) +{ + struct max31335_data *max31335 = clk_hw_to_max31335(hw); + + return regmap_set_bits(max31335->regmap, MAX31335_RTC_CONFIG2, + MAX31335_RTC_CONFIG2_ENCLKO); +} + +static void max31335_clkout_disable(struct clk_hw *hw) +{ + struct max31335_data *max31335 = clk_hw_to_max31335(hw); + + regmap_clear_bits(max31335->regmap, MAX31335_RTC_CONFIG2, + MAX31335_RTC_CONFIG2_ENCLKO); +} + +static int max31335_clkout_is_enabled(struct clk_hw *hw) +{ + struct max31335_data *max31335 = clk_hw_to_max31335(hw); + unsigned int reg; + int ret; + + ret = regmap_read(max31335->regmap, MAX31335_RTC_CONFIG2, ®); + if (ret) + return ret; + + return !!(reg & MAX31335_RTC_CONFIG2_ENCLKO); +} + +static const struct clk_ops max31335_clkout_ops = { + .recalc_rate = max31335_clkout_recalc_rate, + .round_rate = max31335_clkout_round_rate, + .set_rate = max31335_clkout_set_rate, + .enable = max31335_clkout_enable, + .disable = max31335_clkout_disable, + .is_enabled = max31335_clkout_is_enabled, +}; + +static struct clk_init_data max31335_clk_init = { + .name = "max31335-clkout", + .ops = &max31335_clkout_ops, +}; + +static int max31335_nvmem_reg_read(void *priv, unsigned int offset, + void *val, size_t bytes) +{ + struct max31335_data *max31335 = priv; + unsigned int reg = MAX31335_TS0_SEC_1_128 + offset; + + return regmap_bulk_read(max31335->regmap, reg, val, bytes); +} + +static int max31335_nvmem_reg_write(void *priv, unsigned int offset, + void *val, size_t bytes) +{ + struct max31335_data *max31335 = priv; + unsigned int reg = MAX31335_TS0_SEC_1_128 + offset; + + return regmap_bulk_write(max31335->regmap, reg, val, bytes); +} + +static struct nvmem_config max31335_nvmem_cfg = { + .reg_read = max31335_nvmem_reg_read, + .reg_write = max31335_nvmem_reg_write, + .word_size = 8, + .size = MAX31335_RAM_SIZE, +}; + +#if IS_REACHABLE(HWMON) +static int max31335_read_temp(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *val) +{ + struct max31335_data *max31335 = dev_get_drvdata(dev); + u8 reg[2]; + s16 temp; + int ret; + + if (type != hwmon_temp || attr != hwmon_temp_input) + return -EOPNOTSUPP; + + ret = regmap_bulk_read(max31335->regmap, MAX31335_TEMP_DATA_MSB, + reg, 2); + if (ret) + return ret; + + temp = get_unaligned_be16(reg); + + *val = (temp / 64) * 250; + + return 0; +} + +static umode_t max31335_is_visible(const void *data, + enum hwmon_sensor_types type, + u32 attr, int channel) +{ + if (type == hwmon_temp && attr == hwmon_temp_input) + return 0444; + + return 0; +} + +static const struct hwmon_channel_info *max31335_info[] = { + HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT), + NULL +}; + +static const struct hwmon_ops max31335_hwmon_ops = { + .is_visible = max31335_is_visible, + .read = max31335_read_temp, +}; + +static const struct hwmon_chip_info max31335_chip_info = { + .ops = &max31335_hwmon_ops, + .info = max31335_info, +}; +#endif + +static int max31335_clkout_register(struct device *dev) +{ + struct max31335_data *max31335 = dev_get_drvdata(dev); + int ret; + + if (!device_property_present(dev, "#clock-cells")) + return regmap_clear_bits(max31335->regmap, MAX31335_RTC_CONFIG2, + MAX31335_RTC_CONFIG2_ENCLKO); + + max31335->clkout.init = &max31335_clk_init; + + ret = devm_clk_hw_register(dev, &max31335->clkout); + if (ret) + return dev_err_probe(dev, ret, "cannot register clock\n"); + + ret = devm_of_clk_add_hw_provider(dev, of_clk_hw_simple_get, + &max31335->clkout); + if (ret) + return dev_err_probe(dev, ret, "cannot add hw provider\n"); + + max31335->clkout.clk = devm_clk_get_enabled(dev, NULL); + if (IS_ERR(max31335->clkout.clk)) + return dev_err_probe(dev, PTR_ERR(max31335->clkout.clk), + "cannot enable clkout\n"); + + return 0; +} + +static int max31335_probe(struct i2c_client *client) +{ + struct max31335_data *max31335; +#if IS_REACHABLE(HWMON) + struct device *hwmon; +#endif + int ret; + + max31335 = devm_kzalloc(&client->dev, sizeof(*max31335), GFP_KERNEL); + if (!max31335) + return -ENOMEM; + + max31335->regmap = devm_regmap_init_i2c(client, ®map_config); + if (IS_ERR(max31335->regmap)) + return PTR_ERR(max31335->regmap); + + i2c_set_clientdata(client, max31335); + + max31335->rtc = devm_rtc_allocate_device(&client->dev); + if (IS_ERR(max31335->rtc)) + return PTR_ERR(max31335->rtc); + + max31335->rtc->ops = &max31335_rtc_ops; + max31335->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; + max31335->rtc->range_max = RTC_TIMESTAMP_END_2199; + max31335->rtc->alarm_offset_max = 24 * 60 * 60; + + ret = max31335_clkout_register(&client->dev); + if (ret) + return ret; + + if (client->irq > 0) { + ret = devm_request_threaded_irq(&client->dev, client->irq, + NULL, max31335_handle_irq, + IRQF_ONESHOT, + "max31335", max31335); + if (ret) { + dev_warn(&client->dev, + "unable to request IRQ, alarm max31335 disabled\n"); + client->irq = 0; + } + } + + if (!client->irq) + clear_bit(RTC_FEATURE_ALARM, max31335->rtc->features); + + max31335_nvmem_cfg.priv = max31335; + ret = devm_rtc_nvmem_register(max31335->rtc, &max31335_nvmem_cfg); + if (ret) + return dev_err_probe(&client->dev, ret, + "cannot register rtc nvmem\n"); + +#if IS_REACHABLE(HWMON) + hwmon = devm_hwmon_device_register_with_info(&client->dev, client->name, + max31335, + &max31335_chip_info, + NULL); + if (IS_ERR(hwmon)) + return dev_err_probe(&client->dev, PTR_ERR(hwmon), + "cannot register hwmon device\n"); +#endif + + ret = max31335_trickle_charger_setup(&client->dev, max31335); + if (ret) + return ret; + + return devm_rtc_register_device(max31335->rtc); +} + +static const struct i2c_device_id max31335_id[] = { + { "max31335", 0 }, + { } +}; + +MODULE_DEVICE_TABLE(i2c, max31335_id); + +static const struct of_device_id max31335_of_match[] = { + { .compatible = "adi,max31335" }, + { } +}; + +MODULE_DEVICE_TABLE(of, max31335_of_match); + +static struct i2c_driver max31335_driver = { + .driver = { + .name = "rtc-max31335", + .of_match_table = max31335_of_match, + }, + .probe = max31335_probe, + .id_table = max31335_id, +}; +module_i2c_driver(max31335_driver); + +MODULE_AUTHOR("Antoniu Miclaus "); +MODULE_DESCRIPTION("MAX31335 RTC driver"); +MODULE_LICENSE("GPL"); -- GitLab From 02eed83abc1395a1207591aafad9bcfc5cb1abcb Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Sun, 7 Jan 2024 15:07:00 +0200 Subject: [PATCH 0959/1290] drm/amdkfd: fixes for HMM mem allocation Fix err return value and reset pgmap->type after checking it. Fixes: c83dee9b6394 ("drm/amdkfd: add SPM support for SVM") Reviewed-by: Felix Kuehling Signed-off-by: Dafna Hirschfeld Signed-off-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index d630100b9e91b..f856901055d34 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -1026,7 +1026,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev) } else { res = devm_request_free_mem_region(adev->dev, &iomem_resource, size); if (IS_ERR(res)) - return -ENOMEM; + return PTR_ERR(res); pgmap->range.start = res->start; pgmap->range.end = res->end; pgmap->type = MEMORY_DEVICE_PRIVATE; @@ -1042,10 +1042,10 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev) r = devm_memremap_pages(adev->dev, pgmap); if (IS_ERR(r)) { pr_err("failed to register HMM device memory\n"); - /* Disable SVM support capability */ - pgmap->type = 0; if (pgmap->type == MEMORY_DEVICE_PRIVATE) devm_release_mem_region(adev->dev, res->start, resource_size(res)); + /* Disable SVM support capability */ + pgmap->type = 0; return PTR_ERR(r); } -- GitLab From 25852d4b97572ff62ffee574cb8bb4bc551af23a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 2 Oct 2023 14:27:13 -0400 Subject: [PATCH 0960/1290] drm/amdgpu: fix avg vs input power reporting on smu7 Hawaii, Bonaire, Fiji, and Tonga support average power, the others support current power. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- .../gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c index b1a8799e2dee3..aa91730e4eaff 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c @@ -3999,6 +3999,7 @@ static int smu7_read_sensor(struct pp_hwmgr *hwmgr, int idx, uint32_t sclk, mclk, activity_percent; uint32_t offset, val_vid; struct smu7_hwmgr *data = (struct smu7_hwmgr *)(hwmgr->backend); + struct amdgpu_device *adev = hwmgr->adev; /* size must be at least 4 bytes for all sensors */ if (*size < 4) @@ -4042,7 +4043,21 @@ static int smu7_read_sensor(struct pp_hwmgr *hwmgr, int idx, *size = 4; return 0; case AMDGPU_PP_SENSOR_GPU_INPUT_POWER: - return smu7_get_gpu_power(hwmgr, (uint32_t *)value); + if ((adev->asic_type != CHIP_HAWAII) && + (adev->asic_type != CHIP_BONAIRE) && + (adev->asic_type != CHIP_FIJI) && + (adev->asic_type != CHIP_TONGA)) + return smu7_get_gpu_power(hwmgr, (uint32_t *)value); + else + return -EOPNOTSUPP; + case AMDGPU_PP_SENSOR_GPU_AVG_POWER: + if ((adev->asic_type != CHIP_HAWAII) && + (adev->asic_type != CHIP_BONAIRE) && + (adev->asic_type != CHIP_FIJI) && + (adev->asic_type != CHIP_TONGA)) + return -EOPNOTSUPP; + else + return smu7_get_gpu_power(hwmgr, (uint32_t *)value); case AMDGPU_PP_SENSOR_VDDGFX: if ((data->vr_config & VRCONF_VDDGFX_MASK) == (VR_SVI2_PLANE_2 << VRCONF_VDDGFX_SHIFT)) -- GitLab From d02069850fc102b07ae923535d5e212f2c8a34e9 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 2 Oct 2023 14:43:06 -0400 Subject: [PATCH 0961/1290] drm/amdgpu: fall back to INPUT power for AVG power via INFO IOCTL For backwards compatibility with userspace. Fixes: 47f1724db4fe ("drm/amd: Introduce `AMDGPU_PP_SENSOR_GPU_INPUT_POWER`") Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2897 Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index b5ebafd4a3adf..bf4f48fe438d1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1105,7 +1105,12 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) if (amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void *)&ui32, &ui32_size)) { - return -EINVAL; + /* fall back to input power for backwards compat */ + if (amdgpu_dpm_read_sensor(adev, + AMDGPU_PP_SENSOR_GPU_INPUT_POWER, + (void *)&ui32, &ui32_size)) { + return -EINVAL; + } } ui32 >>= 8; break; -- GitLab From 6127d7df4a5b66783da5a55ff60b3920a9c315a2 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 7 Dec 2023 10:36:56 -0500 Subject: [PATCH 0962/1290] drm/amdgpu/pm: clarify debugfs pm output On APUs power is SoC power, not just GPU. Clarify that for UVD/VCE/VCN the IP is powered down, not disabled which can confusing and lead to concerns that the IP is actually not available. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index f3cb490fe79b1..087d57850304c 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -4349,11 +4349,19 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file *m, struct amdgpu_device *a if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VDDNB, (void *)&value, &size)) seq_printf(m, "\t%u mV (VDDNB)\n", value); size = sizeof(uint32_t); - if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void *)&query, &size)) - seq_printf(m, "\t%u.%02u W (average GPU)\n", query >> 8, query & 0xff); + if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void *)&query, &size)) { + if (adev->flags & AMD_IS_APU) + seq_printf(m, "\t%u.%02u W (average SoC including CPU)\n", query >> 8, query & 0xff); + else + seq_printf(m, "\t%u.%02u W (average SoC)\n", query >> 8, query & 0xff); + } size = sizeof(uint32_t); - if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, (void *)&query, &size)) - seq_printf(m, "\t%u.%02u W (current GPU)\n", query >> 8, query & 0xff); + if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, (void *)&query, &size)) { + if (adev->flags & AMD_IS_APU) + seq_printf(m, "\t%u.%02u W (current SoC including CPU)\n", query >> 8, query & 0xff); + else + seq_printf(m, "\t%u.%02u W (current SoC)\n", query >> 8, query & 0xff); + } size = sizeof(value); seq_printf(m, "\n"); @@ -4379,9 +4387,9 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file *m, struct amdgpu_device *a /* VCN clocks */ if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VCN_POWER_STATE, (void *)&value, &size)) { if (!value) { - seq_printf(m, "VCN: Disabled\n"); + seq_printf(m, "VCN: Powered down\n"); } else { - seq_printf(m, "VCN: Enabled\n"); + seq_printf(m, "VCN: Powered up\n"); if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_UVD_DCLK, (void *)&value, &size)) seq_printf(m, "\t%u MHz (DCLK)\n", value/100); if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_UVD_VCLK, (void *)&value, &size)) @@ -4393,9 +4401,9 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file *m, struct amdgpu_device *a /* UVD clocks */ if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_UVD_POWER, (void *)&value, &size)) { if (!value) { - seq_printf(m, "UVD: Disabled\n"); + seq_printf(m, "UVD: Powered down\n"); } else { - seq_printf(m, "UVD: Enabled\n"); + seq_printf(m, "UVD: Powered up\n"); if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_UVD_DCLK, (void *)&value, &size)) seq_printf(m, "\t%u MHz (DCLK)\n", value/100); if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_UVD_VCLK, (void *)&value, &size)) @@ -4407,9 +4415,9 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file *m, struct amdgpu_device *a /* VCE clocks */ if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VCE_POWER, (void *)&value, &size)) { if (!value) { - seq_printf(m, "VCE: Disabled\n"); + seq_printf(m, "VCE: Powered down\n"); } else { - seq_printf(m, "VCE: Enabled\n"); + seq_printf(m, "VCE: Powered up\n"); if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VCE_ECCLK, (void *)&value, &size)) seq_printf(m, "\t%u MHz (ECCLK)\n", value/100); } -- GitLab From 8f8cb7124e86c68ab09aa446664192d3829a40be Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Mon, 8 Jan 2024 15:46:12 +0800 Subject: [PATCH 0963/1290] drm/amdgpu: update headers for nbio v7.11 This patch is to update headers for nbio v7.11. Signed-off-by: Yifan Zhang Acked-by: Alex Deucher Reviewed-by: Tim Huang Signed-off-by: Alex Deucher --- .../drm/amd/include/asic_reg/nbio/nbio_7_11_0_offset.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/include/asic_reg/nbio/nbio_7_11_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/nbio/nbio_7_11_0_offset.h index 7ee3d291120d5..6f80bfa7e41ac 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/nbio/nbio_7_11_0_offset.h +++ b/drivers/gpu/drm/amd/include/asic_reg/nbio/nbio_7_11_0_offset.h @@ -8707,10 +8707,10 @@ #define regBIF_BX1_MM_CFGREGS_CNTL_BASE_IDX 2 #define regBIF_BX1_BX_RESET_CNTL 0x00f0 #define regBIF_BX1_BX_RESET_CNTL_BASE_IDX 2 -#define regBIF_BX1_INTERRUPT_CNTL 0x8e11 -#define regBIF_BX1_INTERRUPT_CNTL_BASE_IDX 5 -#define regBIF_BX1_INTERRUPT_CNTL2 0x8e12 -#define regBIF_BX1_INTERRUPT_CNTL2_BASE_IDX 5 +#define regBIF_BX1_INTERRUPT_CNTL 0x00f1 +#define regBIF_BX1_INTERRUPT_CNTL_BASE_IDX 2 +#define regBIF_BX1_INTERRUPT_CNTL2 0x00f2 +#define regBIF_BX1_INTERRUPT_CNTL2_BASE_IDX 2 #define regBIF_BX1_CLKREQB_PAD_CNTL 0x00f8 #define regBIF_BX1_CLKREQB_PAD_CNTL_BASE_IDX 2 #define regBIF_BX1_BIF_FEATURES_CONTROL_MISC 0x00fb -- GitLab From c9edcc1864f8529fd24441da40a1275232b5efc4 Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Mon, 8 Jan 2024 16:05:27 +0800 Subject: [PATCH 0964/1290] drm/amdgpu: update ATHUB_MISC_CNTL offset for athub v3.3 This patch to update ATHUB_MISC_CNTL offset for athub v3.3 v2: correct a typo (Tim) v3: correct patch title (Lang) Signed-off-by: Yifan Zhang Acked-by: Alex Deucher Reviewed-by: Tim Huang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/athub_v3_0.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/athub_v3_0.c b/drivers/gpu/drm/amd/amdgpu/athub_v3_0.c index f0737fb3a999e..d1bba9c64e16d 100644 --- a/drivers/gpu/drm/amd/amdgpu/athub_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/athub_v3_0.c @@ -30,6 +30,8 @@ #define regATHUB_MISC_CNTL_V3_0_1 0x00d7 #define regATHUB_MISC_CNTL_V3_0_1_BASE_IDX 0 +#define regATHUB_MISC_CNTL_V3_3_0 0x00d8 +#define regATHUB_MISC_CNTL_V3_3_0_BASE_IDX 0 static uint32_t athub_v3_0_get_cg_cntl(struct amdgpu_device *adev) @@ -40,6 +42,9 @@ static uint32_t athub_v3_0_get_cg_cntl(struct amdgpu_device *adev) case IP_VERSION(3, 0, 1): data = RREG32_SOC15(ATHUB, 0, regATHUB_MISC_CNTL_V3_0_1); break; + case IP_VERSION(3, 3, 0): + data = RREG32_SOC15(ATHUB, 0, regATHUB_MISC_CNTL_V3_3_0); + break; default: data = RREG32_SOC15(ATHUB, 0, regATHUB_MISC_CNTL); break; @@ -53,6 +58,9 @@ static void athub_v3_0_set_cg_cntl(struct amdgpu_device *adev, uint32_t data) case IP_VERSION(3, 0, 1): WREG32_SOC15(ATHUB, 0, regATHUB_MISC_CNTL_V3_0_1, data); break; + case IP_VERSION(3, 3, 0): + WREG32_SOC15(ATHUB, 0, regATHUB_MISC_CNTL_V3_3_0, data); + break; default: WREG32_SOC15(ATHUB, 0, regATHUB_MISC_CNTL, data); break; -- GitLab From 6616b5e1999146b1304abe78232af810080c67e3 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Fri, 5 Jan 2024 12:05:09 +0530 Subject: [PATCH 0965/1290] drm/amd/powerplay: Fix kzalloc parameter 'ATOM_Tonga_PPM_Table' in 'get_platform_power_management_table()' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In 'struct phm_ppm_table *ptr' allocation using kzalloc, an incorrect structure type is passed to sizeof() in kzalloc, larger structure types were used, thus using correct type 'struct phm_ppm_table' fixes the below: drivers/gpu/drm/amd/amdgpu/../pm/powerplay/hwmgr/process_pptables_v1_0.c:203 get_platform_power_management_table() warn: struct type mismatch 'phm_ppm_table vs _ATOM_Tonga_PPM_Table' Cc: Eric Huang Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/powerplay/hwmgr/process_pptables_v1_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/process_pptables_v1_0.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/process_pptables_v1_0.c index f2a55c1413f59..17882f8dfdd34 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/process_pptables_v1_0.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/process_pptables_v1_0.c @@ -200,7 +200,7 @@ static int get_platform_power_management_table( struct pp_hwmgr *hwmgr, ATOM_Tonga_PPM_Table *atom_ppm_table) { - struct phm_ppm_table *ptr = kzalloc(sizeof(ATOM_Tonga_PPM_Table), GFP_KERNEL); + struct phm_ppm_table *ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); struct phm_ppt_v1_information *pp_table_information = (struct phm_ppt_v1_information *)(hwmgr->pptable); -- GitLab From 30d8dffab7d00da7fd13ecdb7d41a1f25ed6a4af Mon Sep 17 00:00:00 2001 From: Victor Lu Date: Tue, 19 Dec 2023 11:55:09 -0500 Subject: [PATCH 0966/1290] drm/amdgpu: Do not program VM_L2_CNTL under SRIOV VM_L2_CNTL* should not be programmed on driver unload under SRIOV. These regs are skipped during SRIOV driver init. Signed-off-by: Victor Lu Reviewed-by: Vignesh Chander Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c index 95d06da544e2a..49aecdcee0069 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c @@ -456,10 +456,12 @@ static void gfxhub_v1_2_xcc_gart_disable(struct amdgpu_device *adev, WREG32_SOC15_RLC(GC, GET_INST(GC, j), regMC_VM_MX_L1_TLB_CNTL, tmp); /* Setup L2 cache */ - tmp = RREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL); - tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0); - WREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL, tmp); - WREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL3, 0); + if (!amdgpu_sriov_vf(adev)) { + tmp = RREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL); + tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0); + WREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL, tmp); + WREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL3, 0); + } } } -- GitLab From fac4ebd79fed60e79cccafdad45a2bb8d3795044 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Thu, 4 Jan 2024 15:26:42 +0530 Subject: [PATCH 0967/1290] drm/amdgpu: Fix with right return code '-EIO' in 'amdgpu_gmc_vram_checking()' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The amdgpu_gmc_vram_checking() function in emulation checks whether all of the memory range of shared system memory could be accessed by GPU, from this aspect, -EIO is returned for error scenarios. Fixes the below: drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c:919 gmc_v6_0_hw_init() warn: missing error code? 'r' drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c:1103 gmc_v7_0_hw_init() warn: missing error code? 'r' drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c:1223 gmc_v8_0_hw_init() warn: missing error code? 'r' drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c:2344 gmc_v9_0_hw_init() warn: missing error code? 'r' Cc: Xiaojian Du Cc: Lijo Lazar Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Suggested-by: Christian König Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index d2f273d77e595..55784a9f26c4c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -1045,21 +1045,28 @@ int amdgpu_gmc_vram_checking(struct amdgpu_device *adev) * seconds, so here, we just pick up three parts for emulation. */ ret = memcmp(vram_ptr, cptr, 10); - if (ret) - return ret; + if (ret) { + ret = -EIO; + goto release_buffer; + } ret = memcmp(vram_ptr + (size / 2), cptr, 10); - if (ret) - return ret; + if (ret) { + ret = -EIO; + goto release_buffer; + } ret = memcmp(vram_ptr + size - 10, cptr, 10); - if (ret) - return ret; + if (ret) { + ret = -EIO; + goto release_buffer; + } +release_buffer: amdgpu_bo_free_kernel(&vram_bo, &vram_gpu, &vram_ptr); - return 0; + return ret; } static ssize_t current_memory_partition_show( -- GitLab From 8e8272f0dc22e11b2791dc778b07bd66c208d5a8 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Fri, 5 Jan 2024 10:08:58 +0530 Subject: [PATCH 0968/1290] drm/amdgpu: Fix unsigned comparison with less than zero in vpe_u1_8_from_fraction() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variables 'numerator' and 'denominator', are unsigned 16-bit integer types, that can never be less than 0. Thus fixing the below: drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c:62 vpe_u1_8_from_fraction() warn: unsigned 'numerator' is never less than zero. drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c:63 vpe_u1_8_from_fraction() warn: unsigned 'denominator' is never less than zero. Cc: Peyton Lee Cc: Lang Yu Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Reviewed-by: Peyton Lee Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c index 6f149b54d4d39..b9a15d51eb5c3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c @@ -59,11 +59,8 @@ static inline uint16_t complete_integer_division_u16( static uint16_t vpe_u1_8_from_fraction(uint16_t numerator, uint16_t denominator) { - bool arg1_negative = numerator < 0; - bool arg2_negative = denominator < 0; - - uint16_t arg1_value = (uint16_t)(arg1_negative ? -numerator : numerator); - uint16_t arg2_value = (uint16_t)(arg2_negative ? -denominator : denominator); + u16 arg1_value = numerator; + u16 arg2_value = denominator; uint16_t remainder; @@ -100,9 +97,6 @@ static uint16_t vpe_u1_8_from_fraction(uint16_t numerator, uint16_t denominator) res_value += summand; } - if (arg1_negative ^ arg2_negative) - res_value = -res_value; - return res_value; } -- GitLab From 8a44fdd3cf91debbd09b43bd2519ad2b2486ccf4 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Thu, 21 Dec 2023 18:13:11 +0530 Subject: [PATCH 0969/1290] drm/amdgpu: Release 'adev->pm.fw' before return in 'amdgpu_device_need_post()' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In function 'amdgpu_device_need_post(struct amdgpu_device *adev)' - 'adev->pm.fw' may not be released before return. Using the function release_firmware() to release adev->pm.fw. Thus fixing the below: drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:1571 amdgpu_device_need_post() warn: 'adev->pm.fw' from request_firmware() not released on lines: 1554. Cc: Monk Liu Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Suggested-by: Lijo Lazar Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5bb444bb36cec..ecbc58269951c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1544,6 +1544,7 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) return true; fw_ver = *((uint32_t *)adev->pm.fw->data + 69); + release_firmware(adev->pm.fw); if (fw_ver < 0x00160e00) return true; } -- GitLab From 2b9a073b7304f4a9e130d04794c91a0c4f9a5c12 Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Tue, 9 Jan 2024 09:19:22 +0800 Subject: [PATCH 0970/1290] drm/amdgpu: update regGL2C_CTRL4 value in golden setting This patch to update regGL2C_CTRL4 in golden setting. Signed-off-by: Yifan Zhang Reviewed-by: Tim Huang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.7.x --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index c7242877d5d31..0ea0866c261f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -115,7 +115,7 @@ static const struct soc15_reg_golden golden_settings_gc_11_5_0[] = { SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_ADDR_MATCH_MASK, 0xffffffff, 0xfffffff3), SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL, 0xffffffff, 0xf37fff3f), SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL3, 0xfffffffb, 0x00f40188), - SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL4, 0xf0ffffff, 0x8000b007), + SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL4, 0xf0ffffff, 0x80009007), SOC15_REG_GOLDEN_VALUE(GC, 0, regPA_CL_ENHANCE, 0xf1ffffff, 0x00880007), SOC15_REG_GOLDEN_VALUE(GC, 0, regPC_CONFIG_CNTL_1, 0xffffffff, 0x00010000), SOC15_REG_GOLDEN_VALUE(GC, 0, regTA_CNTL_AUX, 0xf7f7ffff, 0x01030000), -- GitLab From 7073934f5d73f8b53308963cee36f0d389ea857c Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Mon, 8 Jan 2024 21:20:28 +0530 Subject: [PATCH 0971/1290] drm/amd/display: Fix variable deferencing before NULL check in edp_setup_replay() In edp_setup_replay(), 'struct dc *dc' & 'struct dmub_replay *replay' was dereferenced before the pointer 'link' & 'replay' NULL check. Fixes the below: drivers/gpu/drm/amd/amdgpu/../display/dc/link/protocols/link_edp_panel_control.c:947 edp_setup_replay() warn: variable dereferenced before check 'link' (see line 933) Cc: stable@vger.kernel.org Cc: Bhawanpreet Lakha Cc: Harry Wentland Cc: Rodrigo Siqueira Cc: Aurabindo Pillai Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Reviewed-by: Aurabindo Pillai Signed-off-by: Alex Deucher --- .../dc/link/protocols/link_edp_panel_control.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c index 7f11965282186..046d3e2054153 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c @@ -930,8 +930,8 @@ bool edp_get_replay_state(const struct dc_link *link, uint64_t *state) bool edp_setup_replay(struct dc_link *link, const struct dc_stream_state *stream) { /* To-do: Setup Replay */ - struct dc *dc = link->ctx->dc; - struct dmub_replay *replay = dc->res_pool->replay; + struct dc *dc; + struct dmub_replay *replay; int i; unsigned int panel_inst; struct replay_context replay_context = { 0 }; @@ -947,6 +947,10 @@ bool edp_setup_replay(struct dc_link *link, const struct dc_stream_state *stream if (!link) return false; + dc = link->ctx->dc; + + replay = dc->res_pool->replay; + if (!replay) return false; @@ -975,8 +979,7 @@ bool edp_setup_replay(struct dc_link *link, const struct dc_stream_state *stream replay_context.line_time_in_ns = lineTimeInNs; - if (replay) - link->replay_settings.replay_feature_enabled = + link->replay_settings.replay_feature_enabled = replay->funcs->replay_copy_settings(replay, link, &replay_context, panel_inst); if (link->replay_settings.replay_feature_enabled) { -- GitLab From 6c5683bd9ecaa7f199c3122c1010ece5d59b1aef Mon Sep 17 00:00:00 2001 From: Le Ma Date: Tue, 9 Jan 2024 12:06:25 +0800 Subject: [PATCH 0972/1290] Revert "drm/amdgpu: add param to specify fw bo location for front-door loading" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit c572abffe9f50c8ba33060865449313b3f588c35. Will use debug module param instead of independent module param. Signed-off-by: Le Ma Reviewed-by: Lijo Lazar Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ----- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 3 +-- 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9da14436a3738..616b6c9117679 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -254,8 +254,6 @@ extern int amdgpu_agp; extern int amdgpu_wbrf; -extern int fw_bo_location; - #define AMDGPU_VM_MAX_NUM_CTX 4096 #define AMDGPU_SG_THRESHOLD (256*1024*1024) #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS 3000 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 852cec98ff262..880137774b4eb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -210,7 +210,6 @@ int amdgpu_seamless = -1; /* auto */ uint amdgpu_debug_mask; int amdgpu_agp = -1; /* auto */ int amdgpu_wbrf = -1; -int fw_bo_location = -1; static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work); @@ -990,10 +989,6 @@ MODULE_PARM_DESC(wbrf, "Enable Wifi RFI interference mitigation (0 = disabled, 1 = enabled, -1 = auto(default)"); module_param_named(wbrf, amdgpu_wbrf, int, 0444); -MODULE_PARM_DESC(fw_bo_location, - "location to put firmware bo for frontdoor loading (-1 = auto (default), 0 = on ram, 1 = on vram"); -module_param(fw_bo_location, int, 0644); - /* These devices are not supported by amdgpu. * They are supported by the mach64, r128, radeon drivers */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 2addbdf88394b..1bf975b8d083e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -466,7 +466,7 @@ static int psp_sw_init(void *handle) } ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG, - (amdgpu_sriov_vf(adev) || fw_bo_location == 1) ? + amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT, &psp->fw_pri_bo, &psp->fw_pri_mc_addr, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c index d334e42fe0ebe..0efb2568cb65f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c @@ -1062,8 +1062,7 @@ int amdgpu_ucode_create_bo(struct amdgpu_device *adev) { if (adev->firmware.load_type != AMDGPU_FW_LOAD_DIRECT) { amdgpu_bo_create_kernel(adev, adev->firmware.fw_size, PAGE_SIZE, - (amdgpu_sriov_vf(adev) || fw_bo_location == 1) ? - AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT, + amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT, &adev->firmware.fw_buf, &adev->firmware.fw_buf_mc, &adev->firmware.fw_buf_ptr); -- GitLab From d20e1aec8862e48a352ca86969cee6f530dd41d5 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Tue, 9 Jan 2024 17:44:39 +0800 Subject: [PATCH 0973/1290] drm/amdgpu: add debug flag to place fw bo on vram for frontdoor loading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use debug_mask=0x8 param to help isolating data path issues on new systems in early phase. v2: rename the flag for explicitness (lijo) Signed-off-by: Le Ma Reviewed-by: Lijo Lazar Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 3 ++- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 616b6c9117679..3d8a48f46b015 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1144,6 +1144,7 @@ struct amdgpu_device { bool debug_vm; bool debug_largebar; bool debug_disable_soft_recovery; + bool debug_use_vram_fw_buf; }; static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 880137774b4eb..0776b0c5e4e4f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -128,6 +128,7 @@ enum AMDGPU_DEBUG_MASK { AMDGPU_DEBUG_VM = BIT(0), AMDGPU_DEBUG_LARGEBAR = BIT(1), AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2), + AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3), }; unsigned int amdgpu_vram_limit = UINT_MAX; @@ -2117,6 +2118,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev) pr_info("debug: soft reset for GPU recovery disabled\n"); adev->debug_disable_soft_recovery = true; } + + if (amdgpu_debug_mask & AMDGPU_DEBUG_USE_VRAM_FW_BUF) { + pr_info("debug: place fw in vram for frontdoor loading\n"); + adev->debug_use_vram_fw_buf = true; + } } static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 1bf975b8d083e..0328616473f80 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -466,7 +466,7 @@ static int psp_sw_init(void *handle) } ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG, - amdgpu_sriov_vf(adev) ? + (amdgpu_sriov_vf(adev) || adev->debug_use_vram_fw_buf) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT, &psp->fw_pri_bo, &psp->fw_pri_mc_addr, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c index 0efb2568cb65f..3e12763e477aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c @@ -1062,7 +1062,8 @@ int amdgpu_ucode_create_bo(struct amdgpu_device *adev) { if (adev->firmware.load_type != AMDGPU_FW_LOAD_DIRECT) { amdgpu_bo_create_kernel(adev, adev->firmware.fw_size, PAGE_SIZE, - amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT, + (amdgpu_sriov_vf(adev) || adev->debug_use_vram_fw_buf) ? + AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT, &adev->firmware.fw_buf, &adev->firmware.fw_buf_mc, &adev->firmware.fw_buf_ptr); -- GitLab From 51258acdc4758d43f03ec9cab6f3fa72a2838f0e Mon Sep 17 00:00:00 2001 From: Le Ma Date: Tue, 9 Jan 2024 18:06:35 +0800 Subject: [PATCH 0974/1290] drm/amdgpu: move debug options init prior to amdgpu device init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To bring debug options into effect in early initialization phase Signed-off-by: Le Ma Reviewed-by: Lijo Lazar Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 0776b0c5e4e4f..5c9caf5fa0758 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2234,6 +2234,8 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, ddev); + amdgpu_init_debug_options(adev); + ret = amdgpu_driver_load_kms(adev, flags); if (ret) goto err_pci; @@ -2314,8 +2316,6 @@ retry_init: amdgpu_get_secondary_funcs(adev); } - amdgpu_init_debug_options(adev); - return 0; err_pci: -- GitLab From c3d5e297dcae88274dc6924db337a2159279eced Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 9 Jan 2024 10:45:42 -0500 Subject: [PATCH 0975/1290] drm/amdgpu: drop exp hw support check for GC 9.4.3 No longer needed. Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.7.x --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index 0431eafa86b53..c7d60dd0fb975 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -1963,8 +1963,6 @@ static int amdgpu_discovery_set_gc_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &gfx_v9_0_ip_block); break; case IP_VERSION(9, 4, 3): - if (!amdgpu_exp_hw_support) - return -EINVAL; amdgpu_device_ip_block_add(adev, &gfx_v9_4_3_ip_block); break; case IP_VERSION(10, 1, 10): -- GitLab From d7a254fad873775ce6c32b77796c81e81e6b7f2e Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Tue, 9 Jan 2024 16:57:26 +0530 Subject: [PATCH 0976/1290] drm/amdkfd: Fix 'node' NULL check in 'svm_range_get_range_boundaries()' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Range interval [start, last] is ordered by rb_tree, rb_prev, rb_next return value still needs NULL check, thus modified from "node" to "rb_node". Fixes the below: drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_svm.c:2691 svm_range_get_range_boundaries() warn: can 'node' even be NULL? Suggested-by: Philip Yang Cc: Felix Kuehling Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 4e9f07c7a9376..c50a0dc9c9c07 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2654,6 +2654,7 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, { struct vm_area_struct *vma; struct interval_tree_node *node; + struct rb_node *rb_node; unsigned long start_limit, end_limit; vma = vma_lookup(p->mm, addr << PAGE_SHIFT); @@ -2673,16 +2674,15 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, if (node) { end_limit = min(end_limit, node->start); /* Last range that ends before the fault address */ - node = container_of(rb_prev(&node->rb), - struct interval_tree_node, rb); + rb_node = rb_prev(&node->rb); } else { /* Last range must end before addr because * there was no range after addr */ - node = container_of(rb_last(&p->svms.objects.rb_root), - struct interval_tree_node, rb); + rb_node = rb_last(&p->svms.objects.rb_root); } - if (node) { + if (rb_node) { + node = container_of(rb_node, struct interval_tree_node, rb); if (node->last >= addr) { WARN(1, "Overlap with prev node and page fault addr\n"); return -EFAULT; -- GitLab From 91739a897c12dcec699e53f390be1b4abdeef3a0 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Thu, 11 Jan 2024 09:47:33 +0530 Subject: [PATCH 0977/1290] drm/amd/pm: Add error log for smu v13.0.6 reset For all mode-2 reset fail cases, add error log. Signed-off-by: Lijo Lazar Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.7.x --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 4ebc6b421c2cb..7513d1cfeebd7 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2235,17 +2235,18 @@ static int smu_v13_0_6_mode2_reset(struct smu_context *smu) continue; } - if (ret) { - dev_err(adev->dev, - "failed to send mode2 message \tparam: 0x%08x error code %d\n", - SMU_RESET_MODE_2, ret); + if (ret) goto out; - } + } while (ret == -ETIME && timeout); out: mutex_unlock(&smu->message_lock); + if (ret) + dev_err(adev->dev, "failed to send mode2 reset, error code %d", + ret); + return ret; } -- GitLab From a992c90d8ed3929b70ae815ce21ca5651cc0a692 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Thu, 11 Jan 2024 15:28:53 +0530 Subject: [PATCH 0978/1290] drm/amd/pm: Fix smuv13.0.6 current clock reporting When current clock is equal to max dpm level clock, the level is not indicated correctly with *. Fix by comparing current clock against dpm level value. Signed-off-by: Lijo Lazar Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.7.x --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 7513d1cfeebd7..a28649f210933 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -970,7 +970,9 @@ static int smu_v13_0_6_print_clks(struct smu_context *smu, char *buf, int size, if (i < (clocks.num_levels - 1)) clk2 = clocks.data[i + 1].clocks_in_khz / 1000; - if (curr_clk >= clk1 && curr_clk < clk2) { + if (curr_clk == clk1) { + level = i; + } else if (curr_clk >= clk1 && curr_clk < clk2) { level = (curr_clk - clk1) <= (clk2 - curr_clk) ? i : i + 1; -- GitLab From d7643fe6fb76edb1f2f1497bf5e8b8f4774b5129 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 10 Jan 2024 13:46:47 -0700 Subject: [PATCH 0979/1290] drm/amd/display: Avoid enum conversion warning Clang warns (or errors with CONFIG_WERROR=y) when performing arithmetic with different enumerated types, which is usually a bug: drivers/gpu/drm/amd/amdgpu/../display/dc/link/protocols/link_dp_dpia_bw.c:548:24: error: arithmetic between different enumeration types ('const enum dc_link_rate' and 'const enum dc_lane_count') [-Werror,-Wenum-enum-conversion] 548 | link_cap->link_rate * link_cap->lane_count * LINK_RATE_REF_FREQ_IN_KHZ * 8; | ~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~~~~~~ 1 error generated. In this case, there is not a problem because the enumerated types are basically treated as '#define' values. Add an explicit cast to an integral type to silence the warning. Closes: https://github.com/ClangBuiltLinux/linux/issues/1976 Fixes: 5f3bce13266e ("drm/amd/display: Request usb4 bw for mst streams") Signed-off-by: Nathan Chancellor Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.c index 4ef1a6a1d1295..dd0d2b206462c 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_dpia_bw.c @@ -544,8 +544,9 @@ int link_dp_dpia_get_dp_overhead_in_dp_tunneling(struct dc_link *link) */ const struct dc_link_settings *link_cap = dc_link_get_link_cap(link); - uint32_t link_bw_in_kbps = - link_cap->link_rate * link_cap->lane_count * LINK_RATE_REF_FREQ_IN_KHZ * 8; + uint32_t link_bw_in_kbps = (uint32_t)link_cap->link_rate * + (uint32_t)link_cap->lane_count * + LINK_RATE_REF_FREQ_IN_KHZ * 8; link_mst_overhead = (link_bw_in_kbps / 64) + ((link_bw_in_kbps % 64) ? 1 : 0); } -- GitLab From c2518da8e6b0e248cfff1d4b6682e14020bd4d3f Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 10 Jan 2024 09:14:35 -0500 Subject: [PATCH 0980/1290] selftests: bonding: Change script interpreter The tests changed by this patch, as well as the scripts they source, use features which are not part of POSIX sh (ex. 'source' and 'local'). As a result, these tests fail when /bin/sh is dash such as on Debian. Change the interpreter to bash so that these tests can run successfully. Fixes: d43eff0b85ae ("selftests: bonding: up/down delay w/ slave link flapping") Tested-by: Hangbin Liu Reviewed-by: Hangbin Liu Reviewed-by: Petr Machata Signed-off-by: Benjamin Poirier Reviewed-by: Przemek Kitszel Signed-off-by: Paolo Abeni --- .../selftests/drivers/net/bonding/mode-1-recovery-updelay.sh | 2 +- .../selftests/drivers/net/bonding/mode-2-recovery-updelay.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/bonding/mode-1-recovery-updelay.sh b/tools/testing/selftests/drivers/net/bonding/mode-1-recovery-updelay.sh index ad4c845a4ac7c..b76bf50309524 100755 --- a/tools/testing/selftests/drivers/net/bonding/mode-1-recovery-updelay.sh +++ b/tools/testing/selftests/drivers/net/bonding/mode-1-recovery-updelay.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # Regression Test: diff --git a/tools/testing/selftests/drivers/net/bonding/mode-2-recovery-updelay.sh b/tools/testing/selftests/drivers/net/bonding/mode-2-recovery-updelay.sh index 2330d37453f95..8c26190021479 100755 --- a/tools/testing/selftests/drivers/net/bonding/mode-2-recovery-updelay.sh +++ b/tools/testing/selftests/drivers/net/bonding/mode-2-recovery-updelay.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # Regression Test: -- GitLab From 49078c1b80b6f7e214ff60cf6f44750b33019cad Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 10 Jan 2024 09:14:36 -0500 Subject: [PATCH 0981/1290] selftests: forwarding: Remove executable bits from lib.sh The lib.sh script is meant to be sourced from other scripts, not executed directly. Therefore, remove the executable bits from lib.sh's permissions. Fixes: fe32dffdcd33 ("selftests: forwarding: add TCPDUMP_EXTRA_FLAGS to lib.sh") Tested-by: Hangbin Liu Reviewed-by: Hangbin Liu Reviewed-by: Vladimir Oltean Signed-off-by: Benjamin Poirier Reviewed-by: Przemek Kitszel Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 tools/testing/selftests/net/forwarding/lib.sh diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh old mode 100755 new mode 100644 -- GitLab From a23aa04042187cbde16f470b49d4ad60d32e9206 Mon Sep 17 00:00:00 2001 From: Qiang Ma Date: Fri, 12 Jan 2024 10:12:49 +0800 Subject: [PATCH 0982/1290] net: stmmac: ethtool: Fixed calltrace caused by unbalanced disable_irq_wake calls We found the following dmesg calltrace when testing the GMAC NIC notebook: [9.448656] ------------[ cut here ]------------ [9.448658] Unbalanced IRQ 43 wake disable [9.448673] WARNING: CPU: 3 PID: 1083 at kernel/irq/manage.c:688 irq_set_irq_wake+0xe0/0x128 [9.448717] CPU: 3 PID: 1083 Comm: ethtool Tainted: G O 4.19 #1 [9.448773] ... [9.448774] Call Trace: [9.448781] [<9000000000209b5c>] show_stack+0x34/0x140 [9.448788] [<9000000000d52700>] dump_stack+0x98/0xd0 [9.448794] [<9000000000228610>] __warn+0xa8/0x120 [9.448797] [<9000000000d2fb60>] report_bug+0x98/0x130 [9.448800] [<900000000020a418>] do_bp+0x248/0x2f0 [9.448805] [<90000000002035f4>] handle_bp_int+0x4c/0x78 [9.448808] [<900000000029ea40>] irq_set_irq_wake+0xe0/0x128 [9.448813] [<9000000000a96a7c>] stmmac_set_wol+0x134/0x150 [9.448819] [<9000000000be6ed0>] dev_ethtool+0x1368/0x2440 [9.448824] [<9000000000c08350>] dev_ioctl+0x1f8/0x3e0 [9.448827] [<9000000000bb2a34>] sock_ioctl+0x2a4/0x450 [9.448832] [<900000000046f044>] do_vfs_ioctl+0xa4/0x738 [9.448834] [<900000000046f778>] ksys_ioctl+0xa0/0xe8 [9.448837] [<900000000046f7d8>] sys_ioctl+0x18/0x28 [9.448840] [<9000000000211ab4>] syscall_common+0x20/0x34 [9.448842] ---[ end trace 40c18d9aec863c3e ]--- Multiple disable_irq_wake() calls will keep decreasing the IRQ wake_depth, When wake_depth is 0, calling disable_irq_wake() again, will report the above calltrace. Due to the need to appear in pairs, we cannot call disable_irq_wake() without calling enable_irq_wake(). Fix this by making sure there are no unbalanced disable_irq_wake() calls. Fixes: 3172d3afa998 ("stmmac: support wake up irq from external sources (v3)") Signed-off-by: Qiang Ma Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240112021249.24598-1-maqianga@uniontech.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 1 + drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 10 ++++++++-- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 9f89acf310502..f155e4841c62b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -267,6 +267,7 @@ struct stmmac_priv { u32 msg_enable; int wolopts; int wol_irq; + bool wol_irq_disabled; int clk_csr; struct timer_list eee_ctrl_timer; int lpi_irq; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 0f8bfba4443d1..42d27b97dd1d0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -832,10 +832,16 @@ static int stmmac_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) if (wol->wolopts) { pr_info("stmmac: wakeup enable\n"); device_set_wakeup_enable(priv->device, 1); - enable_irq_wake(priv->wol_irq); + /* Avoid unbalanced enable_irq_wake calls */ + if (priv->wol_irq_disabled) + enable_irq_wake(priv->wol_irq); + priv->wol_irq_disabled = false; } else { device_set_wakeup_enable(priv->device, 0); - disable_irq_wake(priv->wol_irq); + /* Avoid unbalanced disable_irq_wake calls */ + if (!priv->wol_irq_disabled) + disable_irq_wake(priv->wol_irq); + priv->wol_irq_disabled = true; } mutex_lock(&priv->lock); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 47de466e432c0..c78a96b8eb649 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3628,6 +3628,7 @@ static int stmmac_request_irq_multi_msi(struct net_device *dev) /* Request the Wake IRQ in case of another line * is used for WoL */ + priv->wol_irq_disabled = true; if (priv->wol_irq > 0 && priv->wol_irq != dev->irq) { int_name = priv->int_name_wol; sprintf(int_name, "%s:%s", dev->name, "wol"); -- GitLab From bc7863d18677df66b2c7a0e172c91296ff380f11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=87a=C4=9Fhan=20Demir?= Date: Mon, 15 Jan 2024 20:23:03 +0300 Subject: [PATCH 0983/1290] ALSA: hda/relatek: Enable Mute LED on HP Laptop 15s-fq2xxx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This HP Laptop uses ALC236 codec with COEF 0x07 idx 1 controlling the mute LED. This patch enables the already existing quirk for this device. Signed-off-by: Çağhan Demir Cc: Link: https://lore.kernel.org/r/20240115172303.4718-1-caghandemir@marun.edu.tr Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index b68c947570510..0f0a03e890156 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9861,6 +9861,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x87f5, "HP", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x87f6, "HP Spectre x360 14", ALC245_FIXUP_HP_X360_AMP), SND_PCI_QUIRK(0x103c, 0x87f7, "HP Spectre x360 14", ALC245_FIXUP_HP_X360_AMP), + SND_PCI_QUIRK(0x103c, 0x87fe, "HP Laptop 15s-fq2xxx", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x8805, "HP ProBook 650 G8 Notebook PC", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x880d, "HP EliteBook 830 G8 Notebook PC", ALC285_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8811, "HP Spectre x360 15-eb1xxx", ALC285_FIXUP_HP_SPECTRE_X360_EB1), -- GitLab From b018cee7369896c7a15bfdbe88f168f3dbd8ba27 Mon Sep 17 00:00:00 2001 From: Yo-Jung Lin Date: Tue, 16 Jan 2024 10:07:19 +0800 Subject: [PATCH 0984/1290] ALSA: hda/realtek: Enable mute/micmute LEDs and limit mic boost on HP ZBook On some HP ZBooks, the audio LEDs can be enabled by ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF. So use it accordingly. Signed-off-by: Yo-Jung Lin Cc: Link: https://lore.kernel.org/r/20240116020722.27236-1-leo.lin@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 0f0a03e890156..dbf31fe901daa 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9956,6 +9956,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8c71, "HP EliteBook 845 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c72, "HP EliteBook 865 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c96, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8c97, "HP ZBook", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8ca4, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ca7, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8cf5, "HP ZBook Studio 16", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED), -- GitLab From e37617c8e53a1f7fcba6d5e1041f4fd8a2425c27 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Sun, 14 Jan 2024 19:36:00 +0100 Subject: [PATCH 0985/1290] sched/fair: Fix frequency selection for non-invariant case Linus reported a ~50% performance regression on single-threaded workloads on his AMD Ryzen system, and bisected it to: 9c0b4bb7f630 ("sched/cpufreq: Rework schedutil governor performance estimation") When frequency invariance is not enabled, get_capacity_ref_freq(policy) is supposed to return the current frequency and the performance margin applied by map_util_perf(), enabling the utilization to go above the maximum compute capacity and to select a higher frequency than the current one. After the changes in 9c0b4bb7f630, the performance margin was applied earlier in the path to take into account utilization clampings and we couldn't get a utilization higher than the maximum compute capacity, and the CPU remained 'stuck' at lower frequencies. To fix this, we must use a frequency above the current frequency to get a chance to select a higher OPP when the current one becomes fully used. Apply the same margin and return a frequency 25% higher than the current one in order to switch to the next OPP before we fully use the CPU at the current one. [ mingo: Clarified the changelog. ] Fixes: 9c0b4bb7f630 ("sched/cpufreq: Rework schedutil governor performance estimation") Reported-by: Linus Torvalds Bisected-by: Linus Torvalds Reported-by: Wyes Karny Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Wyes Karny Link: https://lore.kernel.org/r/20240114183600.135316-1-vincent.guittot@linaro.org --- kernel/sched/cpufreq_schedutil.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 95c3c097083e5..eece6244f9d2f 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -133,7 +133,11 @@ unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy) if (arch_scale_freq_invariant()) return policy->cpuinfo.max_freq; - return policy->cur; + /* + * Apply a 25% margin so that we select a higher frequency than + * the current one before the CPU is fully busy: + */ + return policy->cur + (policy->cur >> 2); } /** -- GitLab From 2c4ca7977298c6a3d237d7d703df97e043b75c12 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 14 Jan 2024 14:47:26 -0800 Subject: [PATCH 0986/1290] selftests: netdevsim: sprinkle more udevadm settle Number of tests are failing when netdev renaming is active on the system. Add udevadm settle in logic determining the names. Fixes: 242aaf03dc9b ("selftests: add a test for ethtool pause stats") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240114224726.1210532-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh | 1 + tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh index 922744059aaa2..80160579e0cc1 100644 --- a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh @@ -51,6 +51,7 @@ function make_netdev { fi echo $NSIM_ID $@ > /sys/bus/netdevsim/new_device + udevadm settle # get new device name ls /sys/bus/netdevsim/devices/netdevsim${NSIM_ID}/net/ } diff --git a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh index 1b08e042cf942..4855ef597a152 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh @@ -233,6 +233,7 @@ function print_tables { function get_netdev_name { local -n old=$1 + udevadm settle new=$(ls /sys/class/net) for netdev in $new; do -- GitLab From 4697381bd076adfea4d67394189c8bf27b9cc95b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 14 Jan 2024 14:47:48 -0800 Subject: [PATCH 0987/1290] selftests: netdevsim: correct expected FEC strings ethtool CLI has changed its output. Make the test compatible. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240114224748.1210578-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- .../drivers/net/netdevsim/ethtool-fec.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh index 0c56746e9ce0e..7d7829f57550d 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh @@ -8,16 +8,20 @@ NSIM_NETDEV=$(make_netdev) set -o pipefail +# Since commit 2b3ddcb35357 ("ethtool: fec: Change the prompt ...") +# in ethtool CLI the Configured lines start with Supported/Configured. +configured=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2 | head -1 | cut -d' ' -f1) + # netdevsim starts out with None/None s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) -check $? "$s" "Configured FEC encodings: None +check $? "$s" "$configured FEC encodings: None Active FEC encoding: None" # Test Auto $ETHTOOL --set-fec $NSIM_NETDEV encoding auto check $? s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) -check $? "$s" "Configured FEC encodings: Auto +check $? "$s" "$configured FEC encodings: Auto Active FEC encoding: Off" # Test case in-sensitivity @@ -25,7 +29,7 @@ for o in off Off OFF; do $ETHTOOL --set-fec $NSIM_NETDEV encoding $o check $? s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) - check $? "$s" "Configured FEC encodings: Off + check $? "$s" "$configured FEC encodings: Off Active FEC encoding: Off" done @@ -33,7 +37,7 @@ for o in BaseR baser BAser; do $ETHTOOL --set-fec $NSIM_NETDEV encoding $o check $? s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) - check $? "$s" "Configured FEC encodings: BaseR + check $? "$s" "$configured FEC encodings: BaseR Active FEC encoding: BaseR" done @@ -41,7 +45,7 @@ for o in llrs rs; do $ETHTOOL --set-fec $NSIM_NETDEV encoding $o check $? s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) - check $? "$s" "Configured FEC encodings: ${o^^} + check $? "$s" "$configured FEC encodings: ${o^^} Active FEC encoding: ${o^^}" done @@ -49,13 +53,13 @@ done $ETHTOOL --set-fec $NSIM_NETDEV encoding rs llrs check $? s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) -check $? "$s" "Configured FEC encodings: RS LLRS +check $? "$s" "$configured FEC encodings: RS LLRS Active FEC encoding: LLRS" $ETHTOOL --set-fec $NSIM_NETDEV encoding rs off auto check $? s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) -check $? "$s" "Configured FEC encodings: Auto Off RS +check $? "$s" "$configured FEC encodings: Auto Off RS Active FEC encoding: RS" # Make sure other link modes are rejected -- GitLab From 03fb8565c880d57952d9b4ba0b36468bae52b554 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Jan 2024 18:02:01 -0800 Subject: [PATCH 0988/1290] selftests: bonding: add missing build configs bonding tests also try to create bridge, veth and dummy interfaces. These are not currently listed in config. Fixes: bbb774d921e2 ("net: Add tests for bonding and team address list management") Fixes: c078290a2b76 ("selftests: include bonding tests into the kselftest infra") Acked-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240116020201.1883023-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/bonding/config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index 70638fa50b2cc..f85b16fc5128c 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -1,2 +1,5 @@ CONFIG_BONDING=y +CONFIG_BRIDGE=y +CONFIG_DUMMY=y CONFIG_MACVLAN=y +CONFIG_VETH=y -- GitLab From ecd2ada8a5e0b464dab54f71d4ba7bbf5708711f Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Mon, 15 Jan 2024 05:59:20 +0000 Subject: [PATCH 0989/1290] riscv: Add support for kernel mode vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add kernel_vector_begin() and kernel_vector_end() function declarations and corresponding definitions in kernel_mode_vector.c These are needed to wrap uses of vector in kernel mode. Co-developed-by: Vincent Chen Signed-off-by: Vincent Chen Signed-off-by: Greentime Hu Signed-off-by: Andy Chiu Reviewed-by: Eric Biggers Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-2-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/processor.h | 12 ++- arch/riscv/include/asm/simd.h | 44 ++++++++++ arch/riscv/include/asm/vector.h | 9 ++ arch/riscv/kernel/Makefile | 1 + arch/riscv/kernel/kernel_mode_vector.c | 116 +++++++++++++++++++++++++ arch/riscv/kernel/process.c | 1 + 6 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/include/asm/simd.h create mode 100644 arch/riscv/kernel/kernel_mode_vector.c diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index f19f861cda549..4809f20a20530 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -73,6 +73,15 @@ struct task_struct; struct pt_regs; +/* + * We use a flag to track in-kernel Vector context. Currently the flag has the + * following meaning: + * + * - bit 0: indicates whether the in-kernel Vector context is active. The + * activation of this state disables the preemption. + */ +#define RISCV_KERNEL_MODE_V 0x1 + /* CPU-specific state of a task */ struct thread_struct { /* Callee-saved registers */ @@ -81,7 +90,8 @@ struct thread_struct { unsigned long s[12]; /* s[0]: frame pointer */ struct __riscv_d_ext_state fstate; unsigned long bad_cause; - unsigned long vstate_ctrl; + u32 riscv_v_flags; + u32 vstate_ctrl; struct __riscv_v_ext_state vstate; unsigned long align_ctl; }; diff --git a/arch/riscv/include/asm/simd.h b/arch/riscv/include/asm/simd.h new file mode 100644 index 0000000000000..ef8af413a9fc7 --- /dev/null +++ b/arch/riscv/include/asm/simd.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2017 Linaro Ltd. + * Copyright (C) 2023 SiFive + */ + +#ifndef __ASM_SIMD_H +#define __ASM_SIMD_H + +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_RISCV_ISA_V +/* + * may_use_simd - whether it is allowable at this time to issue vector + * instructions or access the vector register file + * + * Callers must not assume that the result remains true beyond the next + * preempt_enable() or return from softirq context. + */ +static __must_check inline bool may_use_simd(void) +{ + /* + * RISCV_KERNEL_MODE_V is only set while preemption is disabled, + * and is clear whenever preemption is enabled. + */ + return !in_hardirq() && !in_nmi() && !(riscv_v_flags() & RISCV_KERNEL_MODE_V); +} + +#else /* ! CONFIG_RISCV_ISA_V */ + +static __must_check inline bool may_use_simd(void) +{ + return false; +} + +#endif /* ! CONFIG_RISCV_ISA_V */ + +#endif diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index 87aaef656257c..71af3404fda14 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -22,6 +22,15 @@ extern unsigned long riscv_v_vsize; int riscv_v_setup_vsize(void); bool riscv_v_first_use_handler(struct pt_regs *regs); +void kernel_vector_begin(void); +void kernel_vector_end(void); +void get_cpu_vector_context(void); +void put_cpu_vector_context(void); + +static inline u32 riscv_v_flags(void) +{ + return current->thread.riscv_v_flags; +} static __always_inline bool has_vector(void) { diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index fee22a3d1b534..8c58595696b32 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_MMU) += vdso.o vdso/ obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_RISCV_ISA_V) += vector.o +obj-$(CONFIG_RISCV_ISA_V) += kernel_mode_vector.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += cpu_ops.o diff --git a/arch/riscv/kernel/kernel_mode_vector.c b/arch/riscv/kernel/kernel_mode_vector.c new file mode 100644 index 0000000000000..114cf4f0a0eb6 --- /dev/null +++ b/arch/riscv/kernel/kernel_mode_vector.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012 ARM Ltd. + * Author: Catalin Marinas + * Copyright (C) 2017 Linaro Ltd. + * Copyright (C) 2021 SiFive + */ +#include +#include +#include +#include +#include + +#include +#include +#include + +static inline void riscv_v_flags_set(u32 flags) +{ + current->thread.riscv_v_flags = flags; +} + +static inline void riscv_v_start(u32 flags) +{ + int orig; + + orig = riscv_v_flags(); + BUG_ON((orig & flags) != 0); + riscv_v_flags_set(orig | flags); +} + +static inline void riscv_v_stop(u32 flags) +{ + int orig; + + orig = riscv_v_flags(); + BUG_ON((orig & flags) == 0); + riscv_v_flags_set(orig & ~flags); +} + +/* + * Claim ownership of the CPU vector context for use by the calling context. + * + * The caller may freely manipulate the vector context metadata until + * put_cpu_vector_context() is called. + */ +void get_cpu_vector_context(void) +{ + preempt_disable(); + + riscv_v_start(RISCV_KERNEL_MODE_V); +} + +/* + * Release the CPU vector context. + * + * Must be called from a context in which get_cpu_vector_context() was + * previously called, with no call to put_cpu_vector_context() in the + * meantime. + */ +void put_cpu_vector_context(void) +{ + riscv_v_stop(RISCV_KERNEL_MODE_V); + + preempt_enable(); +} + +/* + * kernel_vector_begin(): obtain the CPU vector registers for use by the calling + * context + * + * Must not be called unless may_use_simd() returns true. + * Task context in the vector registers is saved back to memory as necessary. + * + * A matching call to kernel_vector_end() must be made before returning from the + * calling context. + * + * The caller may freely use the vector registers until kernel_vector_end() is + * called. + */ +void kernel_vector_begin(void) +{ + if (WARN_ON(!has_vector())) + return; + + BUG_ON(!may_use_simd()); + + get_cpu_vector_context(); + + riscv_v_vstate_save(current, task_pt_regs(current)); + + riscv_v_enable(); +} +EXPORT_SYMBOL_GPL(kernel_vector_begin); + +/* + * kernel_vector_end(): give the CPU vector registers back to the current task + * + * Must be called from a context in which kernel_vector_begin() was previously + * called, with no call to kernel_vector_end() in the meantime. + * + * The caller must not use the vector registers after this function is called, + * unless kernel_vector_begin() is called again in the meantime. + */ +void kernel_vector_end(void) +{ + if (WARN_ON(!has_vector())) + return; + + riscv_v_vstate_restore(current, task_pt_regs(current)); + + riscv_v_disable(); + + put_cpu_vector_context(); +} +EXPORT_SYMBOL_GPL(kernel_vector_end); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 4f21d970a1292..4a1275db11460 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -221,6 +221,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childregs->a0 = 0; /* Return value of fork() */ p->thread.s[0] = 0; } + p->thread.riscv_v_flags = 0; p->thread.ra = (unsigned long)ret_from_fork; p->thread.sp = (unsigned long)childregs; /* kernel sp */ return 0; -- GitLab From 956895b9d8f74df015636288a81872c07c4fded3 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Mon, 15 Jan 2024 05:59:21 +0000 Subject: [PATCH 0990/1290] riscv: vector: make Vector always available for softirq context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The goal of this patch is to provide full support of Vector in kernel softirq context. So that some of the crypto alogrithms won't need scalar fallbacks. By disabling bottom halves in active kernel-mode Vector, softirq will not be able to nest on top of any kernel-mode Vector. So, softirq context is able to use Vector whenever it runs. After this patch, Vector context cannot start with irqs disabled. Otherwise local_bh_enable() may run in a wrong context. Disabling bh is not enough for RT-kernel to prevent preeemption. So we must disable preemption, which also implies disabling bh on RT. Related-to: commit 696207d4258b ("arm64/sve: Make kernel FPU protection RT friendly") Related-to: commit 66c3ec5a7120 ("arm64: neon: Forbid when irqs are disabled") Signed-off-by: Andy Chiu Reviewed-by: Eric Biggers Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-3-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/processor.h | 3 ++- arch/riscv/include/asm/simd.h | 6 +++++- arch/riscv/kernel/kernel_mode_vector.c | 14 ++++++++++++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 4809f20a20530..55ace554f2021 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -78,7 +78,8 @@ struct pt_regs; * following meaning: * * - bit 0: indicates whether the in-kernel Vector context is active. The - * activation of this state disables the preemption. + * activation of this state disables the preemption. On a non-RT kernel, it + * also disable bh. */ #define RISCV_KERNEL_MODE_V 0x1 diff --git a/arch/riscv/include/asm/simd.h b/arch/riscv/include/asm/simd.h index ef8af413a9fc7..4d699e16c9a96 100644 --- a/arch/riscv/include/asm/simd.h +++ b/arch/riscv/include/asm/simd.h @@ -28,8 +28,12 @@ static __must_check inline bool may_use_simd(void) /* * RISCV_KERNEL_MODE_V is only set while preemption is disabled, * and is clear whenever preemption is enabled. + * + * Kernel-mode Vector temporarily disables bh. So we must not return + * true on irq_disabled(). Otherwise we would fail the lockdep check + * calling local_bh_enable() */ - return !in_hardirq() && !in_nmi() && !(riscv_v_flags() & RISCV_KERNEL_MODE_V); + return !in_hardirq() && !in_nmi() && !irqs_disabled() && !(riscv_v_flags() & RISCV_KERNEL_MODE_V); } #else /* ! CONFIG_RISCV_ISA_V */ diff --git a/arch/riscv/kernel/kernel_mode_vector.c b/arch/riscv/kernel/kernel_mode_vector.c index 114cf4f0a0eb6..2fc145edae3dd 100644 --- a/arch/riscv/kernel/kernel_mode_vector.c +++ b/arch/riscv/kernel/kernel_mode_vector.c @@ -46,7 +46,14 @@ static inline void riscv_v_stop(u32 flags) */ void get_cpu_vector_context(void) { - preempt_disable(); + /* + * disable softirqs so it is impossible for softirqs to nest + * get_cpu_vector_context() when kernel is actively using Vector. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); riscv_v_start(RISCV_KERNEL_MODE_V); } @@ -62,7 +69,10 @@ void put_cpu_vector_context(void) { riscv_v_stop(RISCV_KERNEL_MODE_V); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } /* -- GitLab From c5674d00cacdb1c47c72e19a552fbae401bc3532 Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Mon, 15 Jan 2024 05:59:22 +0000 Subject: [PATCH 0991/1290] riscv: Add vector extension XOR implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for vector optimized XOR and it is tested in qemu. Co-developed-by: Han-Kuan Chen Signed-off-by: Han-Kuan Chen Signed-off-by: Greentime Hu Signed-off-by: Andy Chiu Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-4-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/asm-prototypes.h | 18 ++++++ arch/riscv/include/asm/xor.h | 68 +++++++++++++++++++++ arch/riscv/lib/Makefile | 1 + arch/riscv/lib/xor.S | 81 +++++++++++++++++++++++++ 4 files changed, 168 insertions(+) create mode 100644 arch/riscv/include/asm/xor.h create mode 100644 arch/riscv/lib/xor.S diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index 36b955c762ba0..6db1a9bbff4ce 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -9,6 +9,24 @@ long long __lshrti3(long long a, int b); long long __ashrti3(long long a, int b); long long __ashlti3(long long a, int b); +#ifdef CONFIG_RISCV_ISA_V + +void xor_regs_2_(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2); +void xor_regs_3_(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2, + const unsigned long *__restrict p3); +void xor_regs_4_(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2, + const unsigned long *__restrict p3, + const unsigned long *__restrict p4); +void xor_regs_5_(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2, + const unsigned long *__restrict p3, + const unsigned long *__restrict p4, + const unsigned long *__restrict p5); + +#endif /* CONFIG_RISCV_ISA_V */ #define DECLARE_DO_ERROR_INFO(name) asmlinkage void name(struct pt_regs *regs) diff --git a/arch/riscv/include/asm/xor.h b/arch/riscv/include/asm/xor.h new file mode 100644 index 0000000000000..96011861e46b4 --- /dev/null +++ b/arch/riscv/include/asm/xor.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021 SiFive + */ + +#include +#include +#ifdef CONFIG_RISCV_ISA_V +#include +#include +#include + +static void xor_vector_2(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2) +{ + kernel_vector_begin(); + xor_regs_2_(bytes, p1, p2); + kernel_vector_end(); +} + +static void xor_vector_3(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2, + const unsigned long *__restrict p3) +{ + kernel_vector_begin(); + xor_regs_3_(bytes, p1, p2, p3); + kernel_vector_end(); +} + +static void xor_vector_4(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2, + const unsigned long *__restrict p3, + const unsigned long *__restrict p4) +{ + kernel_vector_begin(); + xor_regs_4_(bytes, p1, p2, p3, p4); + kernel_vector_end(); +} + +static void xor_vector_5(unsigned long bytes, unsigned long *__restrict p1, + const unsigned long *__restrict p2, + const unsigned long *__restrict p3, + const unsigned long *__restrict p4, + const unsigned long *__restrict p5) +{ + kernel_vector_begin(); + xor_regs_5_(bytes, p1, p2, p3, p4, p5); + kernel_vector_end(); +} + +static struct xor_block_template xor_block_rvv = { + .name = "rvv", + .do_2 = xor_vector_2, + .do_3 = xor_vector_3, + .do_4 = xor_vector_4, + .do_5 = xor_vector_5 +}; + +#undef XOR_TRY_TEMPLATES +#define XOR_TRY_TEMPLATES \ + do { \ + xor_speed(&xor_block_8regs); \ + xor_speed(&xor_block_32regs); \ + if (has_vector()) { \ + xor_speed(&xor_block_rvv);\ + } \ + } while (0) +#endif diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index 26cb2502ecf89..494f9cd1a00c0 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -11,3 +11,4 @@ lib-$(CONFIG_64BIT) += tishift.o lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o +lib-$(CONFIG_RISCV_ISA_V) += xor.o diff --git a/arch/riscv/lib/xor.S b/arch/riscv/lib/xor.S new file mode 100644 index 0000000000000..b28f2430e52fa --- /dev/null +++ b/arch/riscv/lib/xor.S @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021 SiFive + */ +#include +#include +#include + +SYM_FUNC_START(xor_regs_2_) + vsetvli a3, a0, e8, m8, ta, ma + vle8.v v0, (a1) + vle8.v v8, (a2) + sub a0, a0, a3 + vxor.vv v16, v0, v8 + add a2, a2, a3 + vse8.v v16, (a1) + add a1, a1, a3 + bnez a0, xor_regs_2_ + ret +SYM_FUNC_END(xor_regs_2_) +EXPORT_SYMBOL(xor_regs_2_) + +SYM_FUNC_START(xor_regs_3_) + vsetvli a4, a0, e8, m8, ta, ma + vle8.v v0, (a1) + vle8.v v8, (a2) + sub a0, a0, a4 + vxor.vv v0, v0, v8 + vle8.v v16, (a3) + add a2, a2, a4 + vxor.vv v16, v0, v16 + add a3, a3, a4 + vse8.v v16, (a1) + add a1, a1, a4 + bnez a0, xor_regs_3_ + ret +SYM_FUNC_END(xor_regs_3_) +EXPORT_SYMBOL(xor_regs_3_) + +SYM_FUNC_START(xor_regs_4_) + vsetvli a5, a0, e8, m8, ta, ma + vle8.v v0, (a1) + vle8.v v8, (a2) + sub a0, a0, a5 + vxor.vv v0, v0, v8 + vle8.v v16, (a3) + add a2, a2, a5 + vxor.vv v0, v0, v16 + vle8.v v24, (a4) + add a3, a3, a5 + vxor.vv v16, v0, v24 + add a4, a4, a5 + vse8.v v16, (a1) + add a1, a1, a5 + bnez a0, xor_regs_4_ + ret +SYM_FUNC_END(xor_regs_4_) +EXPORT_SYMBOL(xor_regs_4_) + +SYM_FUNC_START(xor_regs_5_) + vsetvli a6, a0, e8, m8, ta, ma + vle8.v v0, (a1) + vle8.v v8, (a2) + sub a0, a0, a6 + vxor.vv v0, v0, v8 + vle8.v v16, (a3) + add a2, a2, a6 + vxor.vv v0, v0, v16 + vle8.v v24, (a4) + add a3, a3, a6 + vxor.vv v0, v0, v24 + vle8.v v8, (a5) + add a4, a4, a6 + vxor.vv v16, v0, v8 + add a5, a5, a6 + vse8.v v16, (a1) + add a1, a1, a6 + bnez a0, xor_regs_5_ + ret +SYM_FUNC_END(xor_regs_5_) +EXPORT_SYMBOL(xor_regs_5_) -- GitLab From 7df56cbc27e4239807b5d8860f79a7350d63a741 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Mon, 15 Jan 2024 05:59:23 +0000 Subject: [PATCH 0992/1290] riscv: sched: defer restoring Vector context for user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User will use its Vector registers only after the kernel really returns to the userspace. So we can delay restoring Vector registers as long as we are still running in kernel mode. So, add a thread flag to indicates the need of restoring Vector and do the restore at the last arch-specific exit-to-user hook. This save the context restoring cost when we switch over multiple processes that run V in kernel mode. For example, if the kernel performs a context swicth from A->B->C, and returns to C's userspace, then there is no need to restore B's V-register. Besides, this also prevents us from repeatedly restoring V context when executing kernel-mode Vector multiple times. The cost of this is that we must disable preemption and mark vector as busy during vstate_{save,restore}. Because then the V context will not get restored back immediately when a trap-causing context switch happens in the middle of vstate_{save,restore}. Signed-off-by: Andy Chiu Acked-by: Conor Dooley Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-5-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/entry-common.h | 17 +++++++++++++++++ arch/riscv/include/asm/thread_info.h | 2 ++ arch/riscv/include/asm/vector.h | 11 ++++++++++- arch/riscv/kernel/kernel_mode_vector.c | 2 +- arch/riscv/kernel/process.c | 2 ++ arch/riscv/kernel/ptrace.c | 5 ++++- arch/riscv/kernel/signal.c | 5 ++++- arch/riscv/kernel/vector.c | 2 +- 8 files changed, 41 insertions(+), 5 deletions(-) diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h index 7ab5e34318c85..19023c430a9b5 100644 --- a/arch/riscv/include/asm/entry-common.h +++ b/arch/riscv/include/asm/entry-common.h @@ -4,6 +4,23 @@ #define _ASM_RISCV_ENTRY_COMMON_H #include +#include +#include + +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ + if (ti_work & _TIF_RISCV_V_DEFER_RESTORE) { + clear_thread_flag(TIF_RISCV_V_DEFER_RESTORE); + /* + * We are already called with irq disabled, so go without + * keeping track of riscv_v_flags. + */ + riscv_v_vstate_restore(current, regs); + } +} + +#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare void handle_page_fault(struct pt_regs *regs); void handle_break(struct pt_regs *regs); diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index 574779900bfb3..1047a97ddbc8a 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -103,12 +103,14 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */ #define TIF_UPROBE 10 /* uprobe breakpoint or singlestep */ #define TIF_32BIT 11 /* compat-mode 32bit process */ +#define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */ #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_RISCV_V_DEFER_RESTORE (1 << TIF_RISCV_V_DEFER_RESTORE) #define _TIF_WORK_MASK \ (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED | \ diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index 71af3404fda14..961c4e3d1b620 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -193,6 +193,15 @@ static inline void riscv_v_vstate_restore(struct task_struct *task, } } +static inline void riscv_v_vstate_set_restore(struct task_struct *task, + struct pt_regs *regs) +{ + if ((regs->status & SR_VS) != SR_VS_OFF) { + set_tsk_thread_flag(task, TIF_RISCV_V_DEFER_RESTORE); + riscv_v_vstate_on(regs); + } +} + static inline void __switch_to_vector(struct task_struct *prev, struct task_struct *next) { @@ -200,7 +209,7 @@ static inline void __switch_to_vector(struct task_struct *prev, regs = task_pt_regs(prev); riscv_v_vstate_save(prev, regs); - riscv_v_vstate_restore(next, task_pt_regs(next)); + riscv_v_vstate_set_restore(next, task_pt_regs(next)); } void riscv_v_vstate_ctrl_init(struct task_struct *tsk); diff --git a/arch/riscv/kernel/kernel_mode_vector.c b/arch/riscv/kernel/kernel_mode_vector.c index 2fc145edae3dd..8422c881f4529 100644 --- a/arch/riscv/kernel/kernel_mode_vector.c +++ b/arch/riscv/kernel/kernel_mode_vector.c @@ -117,7 +117,7 @@ void kernel_vector_end(void) if (WARN_ON(!has_vector())) return; - riscv_v_vstate_restore(current, task_pt_regs(current)); + riscv_v_vstate_set_restore(current, task_pt_regs(current)); riscv_v_disable(); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 4a1275db11460..36993f408de4a 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -171,6 +171,7 @@ void flush_thread(void) riscv_v_vstate_off(task_pt_regs(current)); kfree(current->thread.vstate.datap); memset(¤t->thread.vstate, 0, sizeof(struct __riscv_v_ext_state)); + clear_tsk_thread_flag(current, TIF_RISCV_V_DEFER_RESTORE); #endif } @@ -187,6 +188,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) *dst = *src; /* clear entire V context, including datap for a new task */ memset(&dst->thread.vstate, 0, sizeof(struct __riscv_v_ext_state)); + clear_tsk_thread_flag(dst, TIF_RISCV_V_DEFER_RESTORE); return 0; } diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 2afe460de16a6..7b93bcbdf9fab 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -99,8 +99,11 @@ static int riscv_vr_get(struct task_struct *target, * Ensure the vector registers have been saved to the memory before * copying them to membuf. */ - if (target == current) + if (target == current) { + get_cpu_vector_context(); riscv_v_vstate_save(current, task_pt_regs(current)); + put_cpu_vector_context(); + } ptrace_vstate.vstart = vstate->vstart; ptrace_vstate.vl = vstate->vl; diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 88b6220b26087..aca4a12c84162 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -86,7 +86,10 @@ static long save_v_state(struct pt_regs *regs, void __user **sc_vec) /* datap is designed to be 16 byte aligned for better performance */ WARN_ON(unlikely(!IS_ALIGNED((unsigned long)datap, 16))); + get_cpu_vector_context(); riscv_v_vstate_save(current, regs); + put_cpu_vector_context(); + /* Copy everything of vstate but datap. */ err = __copy_to_user(&state->v_state, ¤t->thread.vstate, offsetof(struct __riscv_v_ext_state, datap)); @@ -134,7 +137,7 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec) if (unlikely(err)) return err; - riscv_v_vstate_restore(current, regs); + riscv_v_vstate_set_restore(current, regs); return err; } diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index 578b6292487e1..66e8c6ab09d25 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -167,7 +167,7 @@ bool riscv_v_first_use_handler(struct pt_regs *regs) return true; } riscv_v_vstate_on(regs); - riscv_v_vstate_restore(current, regs); + riscv_v_vstate_set_restore(current, regs); return true; } -- GitLab From c2a658d419246108c9bf065ec347355de5ba8a05 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Mon, 15 Jan 2024 05:59:24 +0000 Subject: [PATCH 0993/1290] riscv: lib: vectorize copy_to_user/copy_from_user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch utilizes Vector to perform copy_to_user/copy_from_user. If Vector is available and the size of copy is large enough for Vector to perform better than scalar, then direct the kernel to do Vector copies for userspace. Though the best programming practice for users is to reduce the copy, this provides a faster variant when copies are inevitable. The optimal size for using Vector, copy_to_user_thres, is only a heuristic for now. We can add DT parsing if people feel the need of customizing it. The exception fixup code of the __asm_vector_usercopy must fallback to the scalar one because accessing user pages might fault, and must be sleepable. Current kernel-mode Vector does not allow tasks to be preemptible, so we must disactivate Vector and perform a scalar fallback in such case. The original implementation of Vector operations comes from https://github.com/sifive/sifive-libc, which we agree to contribute to Linux kernel. Co-developed-by: Jerry Shih Signed-off-by: Jerry Shih Co-developed-by: Nick Knight Signed-off-by: Nick Knight Suggested-by: Guo Ren Signed-off-by: Andy Chiu Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-6-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 8 ++++ arch/riscv/include/asm/asm-prototypes.h | 4 ++ arch/riscv/lib/Makefile | 6 ++- arch/riscv/lib/riscv_v_helpers.c | 45 +++++++++++++++++++++ arch/riscv/lib/uaccess.S | 10 +++++ arch/riscv/lib/uaccess_vector.S | 53 +++++++++++++++++++++++++ 6 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/lib/riscv_v_helpers.c create mode 100644 arch/riscv/lib/uaccess_vector.S diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 95a2a06acc6a6..3c5ba05e8a2da 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -525,6 +525,14 @@ config RISCV_ISA_V_DEFAULT_ENABLE If you don't know what to do here, say Y. +config RISCV_ISA_V_UCOPY_THRESHOLD + int "Threshold size for vectorized user copies" + depends on RISCV_ISA_V + default 768 + help + Prefer using vectorized copy_to_user()/copy_from_user() when the + workload size exceeds this value. + config TOOLCHAIN_HAS_ZBB bool default y diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index 6db1a9bbff4ce..be438932f321f 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -11,6 +11,10 @@ long long __ashlti3(long long a, int b); #ifdef CONFIG_RISCV_ISA_V +#ifdef CONFIG_MMU +asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n); +#endif /* CONFIG_MMU */ + void xor_regs_2_(unsigned long bytes, unsigned long *__restrict p1, const unsigned long *__restrict p2); void xor_regs_3_(unsigned long bytes, unsigned long *__restrict p1, diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index 494f9cd1a00c0..c8a6787d58273 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -6,9 +6,13 @@ lib-y += memmove.o lib-y += strcmp.o lib-y += strlen.o lib-y += strncmp.o -lib-$(CONFIG_MMU) += uaccess.o +ifeq ($(CONFIG_MMU), y) +lib-y += uaccess.o +lib-$(CONFIG_RISCV_ISA_V) += uaccess_vector.o +endif lib-$(CONFIG_64BIT) += tishift.o lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_RISCV_ISA_V) += xor.o +lib-$(CONFIG_RISCV_ISA_V) += riscv_v_helpers.o diff --git a/arch/riscv/lib/riscv_v_helpers.c b/arch/riscv/lib/riscv_v_helpers.c new file mode 100644 index 0000000000000..be38a93cedaec --- /dev/null +++ b/arch/riscv/lib/riscv_v_helpers.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 SiFive + * Author: Andy Chiu + */ +#include +#include + +#include +#include + +#ifdef CONFIG_MMU +#include +#endif + +#ifdef CONFIG_MMU +size_t riscv_v_usercopy_threshold = CONFIG_RISCV_ISA_V_UCOPY_THRESHOLD; +int __asm_vector_usercopy(void *dst, void *src, size_t n); +int fallback_scalar_usercopy(void *dst, void *src, size_t n); +asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n) +{ + size_t remain, copied; + + /* skip has_vector() check because it has been done by the asm */ + if (!may_use_simd()) + goto fallback; + + kernel_vector_begin(); + remain = __asm_vector_usercopy(dst, src, n); + kernel_vector_end(); + + if (remain) { + copied = n - remain; + dst += copied; + src += copied; + n = remain; + goto fallback; + } + + return remain; + +fallback: + return fallback_scalar_usercopy(dst, src, n); +} +#endif diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S index 3ab438f30d132..a1e4a3c429254 100644 --- a/arch/riscv/lib/uaccess.S +++ b/arch/riscv/lib/uaccess.S @@ -3,6 +3,8 @@ #include #include #include +#include +#include .macro fixup op reg addr lbl 100: @@ -11,6 +13,13 @@ .endm SYM_FUNC_START(__asm_copy_to_user) +#ifdef CONFIG_RISCV_ISA_V + ALTERNATIVE("j fallback_scalar_usercopy", "nop", 0, RISCV_ISA_EXT_v, CONFIG_RISCV_ISA_V) + REG_L t0, riscv_v_usercopy_threshold + bltu a2, t0, fallback_scalar_usercopy + tail enter_vector_usercopy +#endif +SYM_FUNC_START(fallback_scalar_usercopy) /* Enable access to user memory */ li t6, SR_SUM @@ -181,6 +190,7 @@ SYM_FUNC_START(__asm_copy_to_user) sub a0, t5, a0 ret SYM_FUNC_END(__asm_copy_to_user) +SYM_FUNC_END(fallback_scalar_usercopy) EXPORT_SYMBOL(__asm_copy_to_user) SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user) EXPORT_SYMBOL(__asm_copy_from_user) diff --git a/arch/riscv/lib/uaccess_vector.S b/arch/riscv/lib/uaccess_vector.S new file mode 100644 index 0000000000000..51ab5588e9ff3 --- /dev/null +++ b/arch/riscv/lib/uaccess_vector.S @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#include +#include +#include +#include +#include + +#define pDst a0 +#define pSrc a1 +#define iNum a2 + +#define iVL a3 + +#define ELEM_LMUL_SETTING m8 +#define vData v0 + + .macro fixup op reg addr lbl +100: + \op \reg, \addr + _asm_extable 100b, \lbl + .endm + +SYM_FUNC_START(__asm_vector_usercopy) + /* Enable access to user memory */ + li t6, SR_SUM + csrs CSR_STATUS, t6 + +loop: + vsetvli iVL, iNum, e8, ELEM_LMUL_SETTING, ta, ma + fixup vle8.v vData, (pSrc), 10f + sub iNum, iNum, iVL + add pSrc, pSrc, iVL + fixup vse8.v vData, (pDst), 11f + add pDst, pDst, iVL + bnez iNum, loop + + /* Exception fixup for vector load is shared with normal exit */ +10: + /* Disable access to user memory */ + csrc CSR_STATUS, t6 + mv a0, iNum + ret + + /* Exception fixup code for vector store. */ +11: + /* Undo the subtraction after vle8.v */ + add iNum, iNum, iVL + /* Make sure the scalar fallback skip already processed bytes */ + csrr t2, CSR_VSTART + sub iNum, iNum, t2 + j 10b +SYM_FUNC_END(__asm_vector_usercopy) -- GitLab From a93fdaf183125fea81f66b9bd756ef5a0c30859e Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Mon, 15 Jan 2024 05:59:25 +0000 Subject: [PATCH 0994/1290] riscv: fpu: drop SR_SD bit checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SR_SD summarizes the dirty status of FS/VS/XS. However, the current code structure does not fully utilize it because each extension specific code is divided into an individual segment. So remove the SR_SD check for now. Signed-off-by: Andy Chiu Reviewed-by: Song Shuai Reviewed-by: Guo Ren Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-7-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/switch_to.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h index f90d8e42f3c79..7efdb0584d47a 100644 --- a/arch/riscv/include/asm/switch_to.h +++ b/arch/riscv/include/asm/switch_to.h @@ -53,8 +53,7 @@ static inline void __switch_to_fpu(struct task_struct *prev, struct pt_regs *regs; regs = task_pt_regs(prev); - if (unlikely(regs->status & SR_SD)) - fstate_save(prev, regs); + fstate_save(prev, regs); fstate_restore(next, task_pt_regs(next)); } -- GitLab From d6c78f1ca3e8ec3fd1afa1bc567cdf083e7af9fe Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Mon, 15 Jan 2024 05:59:26 +0000 Subject: [PATCH 0995/1290] riscv: vector: do not pass task_struct into riscv_v_vstate_{save,restore}() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit riscv_v_vstate_{save,restore}() can operate only on the knowlege of struct __riscv_v_ext_state, and struct pt_regs. Let the caller decides which should be passed into the function. Meanwhile, the kernel-mode Vector is going to introduce another vstate, so this also makes functions potentially able to be reused. Signed-off-by: Andy Chiu Acked-by: Conor Dooley Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-8-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/entry-common.h | 2 +- arch/riscv/include/asm/vector.h | 14 +++++--------- arch/riscv/kernel/kernel_mode_vector.c | 2 +- arch/riscv/kernel/ptrace.c | 2 +- arch/riscv/kernel/signal.c | 2 +- 5 files changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h index 19023c430a9b5..2293e535f8659 100644 --- a/arch/riscv/include/asm/entry-common.h +++ b/arch/riscv/include/asm/entry-common.h @@ -16,7 +16,7 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, * We are already called with irq disabled, so go without * keeping track of riscv_v_flags. */ - riscv_v_vstate_restore(current, regs); + riscv_v_vstate_restore(¤t->thread.vstate, regs); } } diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index 961c4e3d1b620..d750795206299 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -171,23 +171,19 @@ static inline void riscv_v_vstate_discard(struct pt_regs *regs) __riscv_v_vstate_dirty(regs); } -static inline void riscv_v_vstate_save(struct task_struct *task, +static inline void riscv_v_vstate_save(struct __riscv_v_ext_state *vstate, struct pt_regs *regs) { if ((regs->status & SR_VS) == SR_VS_DIRTY) { - struct __riscv_v_ext_state *vstate = &task->thread.vstate; - __riscv_v_vstate_save(vstate, vstate->datap); __riscv_v_vstate_clean(regs); } } -static inline void riscv_v_vstate_restore(struct task_struct *task, +static inline void riscv_v_vstate_restore(struct __riscv_v_ext_state *vstate, struct pt_regs *regs) { if ((regs->status & SR_VS) != SR_VS_OFF) { - struct __riscv_v_ext_state *vstate = &task->thread.vstate; - __riscv_v_vstate_restore(vstate, vstate->datap); __riscv_v_vstate_clean(regs); } @@ -208,7 +204,7 @@ static inline void __switch_to_vector(struct task_struct *prev, struct pt_regs *regs; regs = task_pt_regs(prev); - riscv_v_vstate_save(prev, regs); + riscv_v_vstate_save(&prev->thread.vstate, regs); riscv_v_vstate_set_restore(next, task_pt_regs(next)); } @@ -226,8 +222,8 @@ static inline bool riscv_v_vstate_query(struct pt_regs *regs) { return false; } static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; } #define riscv_v_vsize (0) #define riscv_v_vstate_discard(regs) do {} while (0) -#define riscv_v_vstate_save(task, regs) do {} while (0) -#define riscv_v_vstate_restore(task, regs) do {} while (0) +#define riscv_v_vstate_save(vstate, regs) do {} while (0) +#define riscv_v_vstate_restore(vstate, regs) do {} while (0) #define __switch_to_vector(__prev, __next) do {} while (0) #define riscv_v_vstate_off(regs) do {} while (0) #define riscv_v_vstate_on(regs) do {} while (0) diff --git a/arch/riscv/kernel/kernel_mode_vector.c b/arch/riscv/kernel/kernel_mode_vector.c index 8422c881f4529..241a8f834e1ce 100644 --- a/arch/riscv/kernel/kernel_mode_vector.c +++ b/arch/riscv/kernel/kernel_mode_vector.c @@ -97,7 +97,7 @@ void kernel_vector_begin(void) get_cpu_vector_context(); - riscv_v_vstate_save(current, task_pt_regs(current)); + riscv_v_vstate_save(¤t->thread.vstate, task_pt_regs(current)); riscv_v_enable(); } diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 7b93bcbdf9fab..e8515aa9d80bf 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -101,7 +101,7 @@ static int riscv_vr_get(struct task_struct *target, */ if (target == current) { get_cpu_vector_context(); - riscv_v_vstate_save(current, task_pt_regs(current)); + riscv_v_vstate_save(¤t->thread.vstate, task_pt_regs(current)); put_cpu_vector_context(); } diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index aca4a12c84162..5d69f4db9e8f3 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -87,7 +87,7 @@ static long save_v_state(struct pt_regs *regs, void __user **sc_vec) WARN_ON(unlikely(!IS_ALIGNED((unsigned long)datap, 16))); get_cpu_vector_context(); - riscv_v_vstate_save(current, regs); + riscv_v_vstate_save(¤t->thread.vstate, regs); put_cpu_vector_context(); /* Copy everything of vstate but datap. */ -- GitLab From 5b6048f2ff710196c85ce14373febe8be5115bbe Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Mon, 15 Jan 2024 05:59:27 +0000 Subject: [PATCH 0996/1290] riscv: vector: use a mask to write vstate_ctrl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit riscv_v_ctrl_set() should only touch bits within PR_RISCV_V_VSTATE_CTRL_MASK. So, use the mask when we really set task's vstate_ctrl. Signed-off-by: Andy Chiu Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-9-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vector.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index 66e8c6ab09d25..c1f28bc89ec64 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -122,7 +122,8 @@ static inline void riscv_v_ctrl_set(struct task_struct *tsk, int cur, int nxt, ctrl |= VSTATE_CTRL_MAKE_NEXT(nxt); if (inherit) ctrl |= PR_RISCV_V_VSTATE_CTRL_INHERIT; - tsk->thread.vstate_ctrl = ctrl; + tsk->thread.vstate_ctrl &= ~PR_RISCV_V_VSTATE_CTRL_MASK; + tsk->thread.vstate_ctrl |= ctrl; } bool riscv_v_vstate_ctrl_user_allowed(void) -- GitLab From bd446f5df5afab212917f6732ba6442a5e8de85e Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Mon, 15 Jan 2024 05:59:28 +0000 Subject: [PATCH 0997/1290] riscv: vector: use kmem_cache to manage vector context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The allocation size of thread.vstate.datap is always riscv_v_vsize. So it is possbile to use kmem_cache_* to manage the allocation. This gives users more information regarding allocation of vector context via /proc/slabinfo. And it potentially reduces the latency of the first-use trap because of the allocation caches. Signed-off-by: Andy Chiu Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-10-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/vector.h | 4 ++++ arch/riscv/kernel/process.c | 7 ++++++- arch/riscv/kernel/vector.c | 19 ++++++++++++++++++- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index d750795206299..7b316050f24f7 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -26,6 +26,8 @@ void kernel_vector_begin(void); void kernel_vector_end(void); void get_cpu_vector_context(void); void put_cpu_vector_context(void); +void riscv_v_thread_free(struct task_struct *tsk); +void __init riscv_v_setup_ctx_cache(void); static inline u32 riscv_v_flags(void) { @@ -227,6 +229,8 @@ static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; } #define __switch_to_vector(__prev, __next) do {} while (0) #define riscv_v_vstate_off(regs) do {} while (0) #define riscv_v_vstate_on(regs) do {} while (0) +#define riscv_v_thread_free(tsk) do {} while (0) +#define riscv_v_setup_ctx_cache() do {} while (0) #endif /* CONFIG_RISCV_ISA_V */ diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 36993f408de4a..862d59c3872e2 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -179,7 +179,7 @@ void arch_release_task_struct(struct task_struct *tsk) { /* Free the vector context of datap. */ if (has_vector()) - kfree(tsk->thread.vstate.datap); + riscv_v_thread_free(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) @@ -228,3 +228,8 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) p->thread.sp = (unsigned long)childregs; /* kernel sp */ return 0; } + +void __init arch_task_cache_init(void) +{ + riscv_v_setup_ctx_cache(); +} diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index c1f28bc89ec64..f7b4aeb9e4579 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -21,6 +21,7 @@ #include static bool riscv_v_implicit_uacc = IS_ENABLED(CONFIG_RISCV_ISA_V_DEFAULT_ENABLE); +static struct kmem_cache *riscv_v_user_cachep; unsigned long riscv_v_vsize __read_mostly; EXPORT_SYMBOL_GPL(riscv_v_vsize); @@ -47,6 +48,16 @@ int riscv_v_setup_vsize(void) return 0; } +void __init riscv_v_setup_ctx_cache(void) +{ + if (!has_vector()) + return; + + riscv_v_user_cachep = kmem_cache_create_usercopy("riscv_vector_ctx", + riscv_v_vsize, 16, SLAB_PANIC, + 0, riscv_v_vsize, NULL); +} + static bool insn_is_vector(u32 insn_buf) { u32 opcode = insn_buf & __INSN_OPCODE_MASK; @@ -84,7 +95,7 @@ static int riscv_v_thread_zalloc(void) { void *datap; - datap = kzalloc(riscv_v_vsize, GFP_KERNEL); + datap = kmem_cache_zalloc(riscv_v_user_cachep, GFP_KERNEL); if (!datap) return -ENOMEM; @@ -94,6 +105,12 @@ static int riscv_v_thread_zalloc(void) return 0; } +void riscv_v_thread_free(struct task_struct *tsk) +{ + if (tsk->thread.vstate.datap) + kmem_cache_free(riscv_v_user_cachep, tsk->thread.vstate.datap); +} + #define VSTATE_CTRL_GET_CUR(x) ((x) & PR_RISCV_V_VSTATE_CTRL_CUR_MASK) #define VSTATE_CTRL_GET_NEXT(x) (((x) & PR_RISCV_V_VSTATE_CTRL_NEXT_MASK) >> 2) #define VSTATE_CTRL_MAKE_NEXT(x) (((x) << 2) & PR_RISCV_V_VSTATE_CTRL_NEXT_MASK) -- GitLab From 2080ff9493072a94e42b1856d59f5f1bffb761b7 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Mon, 15 Jan 2024 05:59:29 +0000 Subject: [PATCH 0998/1290] riscv: vector: allow kernel-mode Vector with preemption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add kernel_vstate to keep track of kernel-mode Vector registers when trap introduced context switch happens. Also, provide riscv_v_flags to let context save/restore routine track context status. Context tracking happens whenever the core starts its in-kernel Vector executions. An active (dirty) kernel task's V contexts will be saved to memory whenever a trap-introduced context switch happens. Or, when a softirq, which happens to nest on top of it, uses Vector. Context retoring happens when the execution transfer back to the original Kernel context where it first enable preempt_v. Also, provide a config CONFIG_RISCV_ISA_V_PREEMPTIVE to give users an option to disable preemptible kernel-mode Vector at build time. Users with constraint memory may want to disable this config as preemptible kernel-mode Vector needs extra space for tracking of per thread's kernel-mode V context. Or, users might as well want to disable it if all kernel-mode Vector code is time sensitive and cannot tolerate context switch overhead. Signed-off-by: Andy Chiu Tested-by: Björn Töpel Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240115055929.4736-11-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 14 +++ arch/riscv/include/asm/asm-prototypes.h | 5 + arch/riscv/include/asm/processor.h | 30 +++++- arch/riscv/include/asm/simd.h | 26 ++++- arch/riscv/include/asm/vector.h | 58 ++++++++++- arch/riscv/kernel/entry.S | 8 ++ arch/riscv/kernel/kernel_mode_vector.c | 133 ++++++++++++++++++++++-- arch/riscv/kernel/process.c | 3 + arch/riscv/kernel/vector.c | 31 ++++-- 9 files changed, 286 insertions(+), 22 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 3c5ba05e8a2da..0a03d72706b54 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -533,6 +533,20 @@ config RISCV_ISA_V_UCOPY_THRESHOLD Prefer using vectorized copy_to_user()/copy_from_user() when the workload size exceeds this value. +config RISCV_ISA_V_PREEMPTIVE + bool "Run kernel-mode Vector with kernel preemption" + depends on PREEMPTION + depends on RISCV_ISA_V + default y + help + Usually, in-kernel SIMD routines are run with preemption disabled. + Functions which envoke long running SIMD thus must yield core's + vector unit to prevent blocking other tasks for too long. + + This config allows kernel to run SIMD without explicitly disable + preemption. Enabling this config will result in higher memory + consumption due to the allocation of per-task's kernel Vector context. + config TOOLCHAIN_HAS_ZBB bool default y diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index be438932f321f..cd627ec289f16 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -30,6 +30,11 @@ void xor_regs_5_(unsigned long bytes, unsigned long *__restrict p1, const unsigned long *__restrict p4, const unsigned long *__restrict p5); +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +asmlinkage void riscv_v_context_nesting_start(struct pt_regs *regs); +asmlinkage void riscv_v_context_nesting_end(struct pt_regs *regs); +#endif /* CONFIG_RISCV_ISA_V_PREEMPTIVE */ + #endif /* CONFIG_RISCV_ISA_V */ #define DECLARE_DO_ERROR_INFO(name) asmlinkage void name(struct pt_regs *regs) diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 55ace554f2021..b02119ff08fce 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -80,8 +80,35 @@ struct pt_regs; * - bit 0: indicates whether the in-kernel Vector context is active. The * activation of this state disables the preemption. On a non-RT kernel, it * also disable bh. + * - bits 8: is used for tracking preemptible kernel-mode Vector, when + * RISCV_ISA_V_PREEMPTIVE is enabled. Calling kernel_vector_begin() does not + * disable the preemption if the thread's kernel_vstate.datap is allocated. + * Instead, the kernel set this bit field. Then the trap entry/exit code + * knows if we are entering/exiting the context that owns preempt_v. + * - 0: the task is not using preempt_v + * - 1: the task is actively using preempt_v. But whether does the task own + * the preempt_v context is decided by bits in RISCV_V_CTX_DEPTH_MASK. + * - bit 16-23 are RISCV_V_CTX_DEPTH_MASK, used by context tracking routine + * when preempt_v starts: + * - 0: the task is actively using, and own preempt_v context. + * - non-zero: the task was using preempt_v, but then took a trap within. + * Thus, the task does not own preempt_v. Any use of Vector will have to + * save preempt_v, if dirty, and fallback to non-preemptible kernel-mode + * Vector. + * - bit 30: The in-kernel preempt_v context is saved, and requries to be + * restored when returning to the context that owns the preempt_v. + * - bit 31: The in-kernel preempt_v context is dirty, as signaled by the + * trap entry code. Any context switches out-of current task need to save + * it to the task's in-kernel V context. Also, any traps nesting on-top-of + * preempt_v requesting to use V needs a save. */ -#define RISCV_KERNEL_MODE_V 0x1 +#define RISCV_V_CTX_DEPTH_MASK 0x00ff0000 + +#define RISCV_V_CTX_UNIT_DEPTH 0x00010000 +#define RISCV_KERNEL_MODE_V 0x00000001 +#define RISCV_PREEMPT_V 0x00000100 +#define RISCV_PREEMPT_V_DIRTY 0x80000000 +#define RISCV_PREEMPT_V_NEED_RESTORE 0x40000000 /* CPU-specific state of a task */ struct thread_struct { @@ -95,6 +122,7 @@ struct thread_struct { u32 vstate_ctrl; struct __riscv_v_ext_state vstate; unsigned long align_ctl; + struct __riscv_v_ext_state kernel_vstate; }; /* Whitelist the fstate from the task_struct for hardened usercopy */ diff --git a/arch/riscv/include/asm/simd.h b/arch/riscv/include/asm/simd.h index 4d699e16c9a96..54efbf523d49c 100644 --- a/arch/riscv/include/asm/simd.h +++ b/arch/riscv/include/asm/simd.h @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -28,12 +29,27 @@ static __must_check inline bool may_use_simd(void) /* * RISCV_KERNEL_MODE_V is only set while preemption is disabled, * and is clear whenever preemption is enabled. - * - * Kernel-mode Vector temporarily disables bh. So we must not return - * true on irq_disabled(). Otherwise we would fail the lockdep check - * calling local_bh_enable() */ - return !in_hardirq() && !in_nmi() && !irqs_disabled() && !(riscv_v_flags() & RISCV_KERNEL_MODE_V); + if (in_hardirq() || in_nmi()) + return false; + + /* + * Nesting is acheived in preempt_v by spreading the control for + * preemptible and non-preemptible kernel-mode Vector into two fields. + * Always try to match with prempt_v if kernel V-context exists. Then, + * fallback to check non preempt_v if nesting happens, or if the config + * is not set. + */ + if (IS_ENABLED(CONFIG_RISCV_ISA_V_PREEMPTIVE) && current->thread.kernel_vstate.datap) { + if (!riscv_preempt_v_started(current)) + return true; + } + /* + * Non-preemptible kernel-mode Vector temporarily disables bh. So we + * must not return true on irq_disabled(). Otherwise we would fail the + * lockdep check calling local_bh_enable() + */ + return !irqs_disabled() && !(riscv_v_flags() & RISCV_KERNEL_MODE_V); } #else /* ! CONFIG_RISCV_ISA_V */ diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index 7b316050f24f7..0cd6f0a027d1f 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -28,10 +28,11 @@ void get_cpu_vector_context(void); void put_cpu_vector_context(void); void riscv_v_thread_free(struct task_struct *tsk); void __init riscv_v_setup_ctx_cache(void); +void riscv_v_thread_alloc(struct task_struct *tsk); static inline u32 riscv_v_flags(void) { - return current->thread.riscv_v_flags; + return READ_ONCE(current->thread.riscv_v_flags); } static __always_inline bool has_vector(void) @@ -200,14 +201,62 @@ static inline void riscv_v_vstate_set_restore(struct task_struct *task, } } +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +static inline bool riscv_preempt_v_dirty(struct task_struct *task) +{ + return !!(task->thread.riscv_v_flags & RISCV_PREEMPT_V_DIRTY); +} + +static inline bool riscv_preempt_v_restore(struct task_struct *task) +{ + return !!(task->thread.riscv_v_flags & RISCV_PREEMPT_V_NEED_RESTORE); +} + +static inline void riscv_preempt_v_clear_dirty(struct task_struct *task) +{ + barrier(); + task->thread.riscv_v_flags &= ~RISCV_PREEMPT_V_DIRTY; +} + +static inline void riscv_preempt_v_set_restore(struct task_struct *task) +{ + barrier(); + task->thread.riscv_v_flags |= RISCV_PREEMPT_V_NEED_RESTORE; +} + +static inline bool riscv_preempt_v_started(struct task_struct *task) +{ + return !!(task->thread.riscv_v_flags & RISCV_PREEMPT_V); +} + +#else /* !CONFIG_RISCV_ISA_V_PREEMPTIVE */ +static inline bool riscv_preempt_v_dirty(struct task_struct *task) { return false; } +static inline bool riscv_preempt_v_restore(struct task_struct *task) { return false; } +static inline bool riscv_preempt_v_started(struct task_struct *task) { return false; } +#define riscv_preempt_v_clear_dirty(tsk) do {} while (0) +#define riscv_preempt_v_set_restore(tsk) do {} while (0) +#endif /* CONFIG_RISCV_ISA_V_PREEMPTIVE */ + static inline void __switch_to_vector(struct task_struct *prev, struct task_struct *next) { struct pt_regs *regs; - regs = task_pt_regs(prev); - riscv_v_vstate_save(&prev->thread.vstate, regs); - riscv_v_vstate_set_restore(next, task_pt_regs(next)); + if (riscv_preempt_v_started(prev)) { + if (riscv_preempt_v_dirty(prev)) { + __riscv_v_vstate_save(&prev->thread.kernel_vstate, + prev->thread.kernel_vstate.datap); + riscv_preempt_v_clear_dirty(prev); + } + } else { + regs = task_pt_regs(prev); + riscv_v_vstate_save(&prev->thread.vstate, regs); + } + + if (riscv_preempt_v_started(next)) + riscv_preempt_v_set_restore(next); + else + riscv_v_vstate_set_restore(next, task_pt_regs(next)); } void riscv_v_vstate_ctrl_init(struct task_struct *tsk); @@ -231,6 +280,7 @@ static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; } #define riscv_v_vstate_on(regs) do {} while (0) #define riscv_v_thread_free(tsk) do {} while (0) #define riscv_v_setup_ctx_cache() do {} while (0) +#define riscv_v_thread_alloc(tsk) do {} while (0) #endif /* CONFIG_RISCV_ISA_V */ diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 54ca4564a9263..9d1a305d55087 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -83,6 +83,10 @@ SYM_CODE_START(handle_exception) /* Load the kernel shadow call stack pointer if coming from userspace */ scs_load_current_if_task_changed s5 +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + move a0, sp + call riscv_v_context_nesting_start +#endif move a0, sp /* pt_regs */ la ra, ret_from_exception @@ -138,6 +142,10 @@ SYM_CODE_START_NOALIGN(ret_from_exception) */ csrw CSR_SCRATCH, tp 1: +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + move a0, sp + call riscv_v_context_nesting_end +#endif REG_L a0, PT_STATUS(sp) /* * The current load reservation is effectively part of the processor's diff --git a/arch/riscv/kernel/kernel_mode_vector.c b/arch/riscv/kernel/kernel_mode_vector.c index 241a8f834e1ce..6afe80c7f03ab 100644 --- a/arch/riscv/kernel/kernel_mode_vector.c +++ b/arch/riscv/kernel/kernel_mode_vector.c @@ -14,10 +14,13 @@ #include #include #include +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +#include +#endif static inline void riscv_v_flags_set(u32 flags) { - current->thread.riscv_v_flags = flags; + WRITE_ONCE(current->thread.riscv_v_flags, flags); } static inline void riscv_v_start(u32 flags) @@ -27,12 +30,14 @@ static inline void riscv_v_start(u32 flags) orig = riscv_v_flags(); BUG_ON((orig & flags) != 0); riscv_v_flags_set(orig | flags); + barrier(); } static inline void riscv_v_stop(u32 flags) { int orig; + barrier(); orig = riscv_v_flags(); BUG_ON((orig & flags) == 0); riscv_v_flags_set(orig & ~flags); @@ -75,6 +80,117 @@ void put_cpu_vector_context(void) preempt_enable(); } +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +static __always_inline u32 *riscv_v_flags_ptr(void) +{ + return ¤t->thread.riscv_v_flags; +} + +static inline void riscv_preempt_v_set_dirty(void) +{ + *riscv_v_flags_ptr() |= RISCV_PREEMPT_V_DIRTY; +} + +static inline void riscv_preempt_v_reset_flags(void) +{ + *riscv_v_flags_ptr() &= ~(RISCV_PREEMPT_V_DIRTY | RISCV_PREEMPT_V_NEED_RESTORE); +} + +static inline void riscv_v_ctx_depth_inc(void) +{ + *riscv_v_flags_ptr() += RISCV_V_CTX_UNIT_DEPTH; +} + +static inline void riscv_v_ctx_depth_dec(void) +{ + *riscv_v_flags_ptr() -= RISCV_V_CTX_UNIT_DEPTH; +} + +static inline u32 riscv_v_ctx_get_depth(void) +{ + return *riscv_v_flags_ptr() & RISCV_V_CTX_DEPTH_MASK; +} + +static int riscv_v_stop_kernel_context(void) +{ + if (riscv_v_ctx_get_depth() != 0 || !riscv_preempt_v_started(current)) + return 1; + + riscv_preempt_v_clear_dirty(current); + riscv_v_stop(RISCV_PREEMPT_V); + return 0; +} + +static int riscv_v_start_kernel_context(bool *is_nested) +{ + struct __riscv_v_ext_state *kvstate, *uvstate; + + kvstate = ¤t->thread.kernel_vstate; + if (!kvstate->datap) + return -ENOENT; + + if (riscv_preempt_v_started(current)) { + WARN_ON(riscv_v_ctx_get_depth() == 0); + *is_nested = true; + get_cpu_vector_context(); + if (riscv_preempt_v_dirty(current)) { + __riscv_v_vstate_save(kvstate, kvstate->datap); + riscv_preempt_v_clear_dirty(current); + } + riscv_preempt_v_set_restore(current); + return 0; + } + + /* Transfer the ownership of V from user to kernel, then save */ + riscv_v_start(RISCV_PREEMPT_V | RISCV_PREEMPT_V_DIRTY); + if ((task_pt_regs(current)->status & SR_VS) == SR_VS_DIRTY) { + uvstate = ¤t->thread.vstate; + __riscv_v_vstate_save(uvstate, uvstate->datap); + } + riscv_preempt_v_clear_dirty(current); + return 0; +} + +/* low-level V context handling code, called with irq disabled */ +asmlinkage void riscv_v_context_nesting_start(struct pt_regs *regs) +{ + int depth; + + if (!riscv_preempt_v_started(current)) + return; + + depth = riscv_v_ctx_get_depth(); + if (depth == 0 && (regs->status & SR_VS) == SR_VS_DIRTY) + riscv_preempt_v_set_dirty(); + + riscv_v_ctx_depth_inc(); +} + +asmlinkage void riscv_v_context_nesting_end(struct pt_regs *regs) +{ + struct __riscv_v_ext_state *vstate = ¤t->thread.kernel_vstate; + u32 depth; + + WARN_ON(!irqs_disabled()); + + if (!riscv_preempt_v_started(current)) + return; + + riscv_v_ctx_depth_dec(); + depth = riscv_v_ctx_get_depth(); + if (depth == 0) { + if (riscv_preempt_v_restore(current)) { + __riscv_v_vstate_restore(vstate, vstate->datap); + __riscv_v_vstate_clean(regs); + riscv_preempt_v_reset_flags(); + } + } +} +#else +#define riscv_v_start_kernel_context(nested) (-ENOENT) +#define riscv_v_stop_kernel_context() (-ENOENT) +#endif /* CONFIG_RISCV_ISA_V_PREEMPTIVE */ + /* * kernel_vector_begin(): obtain the CPU vector registers for use by the calling * context @@ -90,14 +206,20 @@ void put_cpu_vector_context(void) */ void kernel_vector_begin(void) { + bool nested = false; + if (WARN_ON(!has_vector())) return; BUG_ON(!may_use_simd()); - get_cpu_vector_context(); + if (riscv_v_start_kernel_context(&nested)) { + get_cpu_vector_context(); + riscv_v_vstate_save(¤t->thread.vstate, task_pt_regs(current)); + } - riscv_v_vstate_save(¤t->thread.vstate, task_pt_regs(current)); + if (!nested) + riscv_v_vstate_set_restore(current, task_pt_regs(current)); riscv_v_enable(); } @@ -117,10 +239,9 @@ void kernel_vector_end(void) if (WARN_ON(!has_vector())) return; - riscv_v_vstate_set_restore(current, task_pt_regs(current)); - riscv_v_disable(); - put_cpu_vector_context(); + if (riscv_v_stop_kernel_context()) + put_cpu_vector_context(); } EXPORT_SYMBOL_GPL(kernel_vector_end); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 862d59c3872e2..92922dbd5b5c1 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -188,6 +188,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) *dst = *src; /* clear entire V context, including datap for a new task */ memset(&dst->thread.vstate, 0, sizeof(struct __riscv_v_ext_state)); + memset(&dst->thread.kernel_vstate, 0, sizeof(struct __riscv_v_ext_state)); clear_tsk_thread_flag(dst, TIF_RISCV_V_DEFER_RESTORE); return 0; @@ -224,6 +225,8 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) p->thread.s[0] = 0; } p->thread.riscv_v_flags = 0; + if (has_vector()) + riscv_v_thread_alloc(p); p->thread.ra = (unsigned long)ret_from_fork; p->thread.sp = (unsigned long)childregs; /* kernel sp */ return 0; diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index f7b4aeb9e4579..6727d1d3b8f28 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -22,6 +22,9 @@ static bool riscv_v_implicit_uacc = IS_ENABLED(CONFIG_RISCV_ISA_V_DEFAULT_ENABLE); static struct kmem_cache *riscv_v_user_cachep; +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE +static struct kmem_cache *riscv_v_kernel_cachep; +#endif unsigned long riscv_v_vsize __read_mostly; EXPORT_SYMBOL_GPL(riscv_v_vsize); @@ -56,6 +59,11 @@ void __init riscv_v_setup_ctx_cache(void) riscv_v_user_cachep = kmem_cache_create_usercopy("riscv_vector_ctx", riscv_v_vsize, 16, SLAB_PANIC, 0, riscv_v_vsize, NULL); +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + riscv_v_kernel_cachep = kmem_cache_create("riscv_vector_kctx", + riscv_v_vsize, 16, + SLAB_PANIC, NULL); +#endif } static bool insn_is_vector(u32 insn_buf) @@ -91,24 +99,35 @@ static bool insn_is_vector(u32 insn_buf) return false; } -static int riscv_v_thread_zalloc(void) +static int riscv_v_thread_zalloc(struct kmem_cache *cache, + struct __riscv_v_ext_state *ctx) { void *datap; - datap = kmem_cache_zalloc(riscv_v_user_cachep, GFP_KERNEL); + datap = kmem_cache_zalloc(cache, GFP_KERNEL); if (!datap) return -ENOMEM; - current->thread.vstate.datap = datap; - memset(¤t->thread.vstate, 0, offsetof(struct __riscv_v_ext_state, - datap)); + ctx->datap = datap; + memset(ctx, 0, offsetof(struct __riscv_v_ext_state, datap)); return 0; } +void riscv_v_thread_alloc(struct task_struct *tsk) +{ +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + riscv_v_thread_zalloc(riscv_v_kernel_cachep, &tsk->thread.kernel_vstate); +#endif +} + void riscv_v_thread_free(struct task_struct *tsk) { if (tsk->thread.vstate.datap) kmem_cache_free(riscv_v_user_cachep, tsk->thread.vstate.datap); +#ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE + if (tsk->thread.kernel_vstate.datap) + kmem_cache_free(riscv_v_kernel_cachep, tsk->thread.kernel_vstate.datap); +#endif } #define VSTATE_CTRL_GET_CUR(x) ((x) & PR_RISCV_V_VSTATE_CTRL_CUR_MASK) @@ -180,7 +199,7 @@ bool riscv_v_first_use_handler(struct pt_regs *regs) * context where VS has been off. So, try to allocate the user's V * context and resume execution. */ - if (riscv_v_thread_zalloc()) { + if (riscv_v_thread_zalloc(riscv_v_user_cachep, ¤t->thread.vstate)) { force_sig(SIGBUS); return true; } -- GitLab From 22c7fa171a02d310e3a3f6ed46a698ca8a0060ed Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Mon, 15 Jan 2024 09:20:27 +0100 Subject: [PATCH 0999/1290] bpf: Reject variable offset alu on PTR_TO_FLOW_KEYS For PTR_TO_FLOW_KEYS, check_flow_keys_access() only uses fixed off for validation. However, variable offset ptr alu is not prohibited for this ptr kind. So the variable offset is not checked. The following prog is accepted: func#0 @0 0: R1=ctx() R10=fp0 0: (bf) r6 = r1 ; R1=ctx() R6_w=ctx() 1: (79) r7 = *(u64 *)(r6 +144) ; R6_w=ctx() R7_w=flow_keys() 2: (b7) r8 = 1024 ; R8_w=1024 3: (37) r8 /= 1 ; R8_w=scalar() 4: (57) r8 &= 1024 ; R8_w=scalar(smin=smin32=0, smax=umax=smax32=umax32=1024,var_off=(0x0; 0x400)) 5: (0f) r7 += r8 mark_precise: frame0: last_idx 5 first_idx 0 subseq_idx -1 mark_precise: frame0: regs=r8 stack= before 4: (57) r8 &= 1024 mark_precise: frame0: regs=r8 stack= before 3: (37) r8 /= 1 mark_precise: frame0: regs=r8 stack= before 2: (b7) r8 = 1024 6: R7_w=flow_keys(smin=smin32=0,smax=umax=smax32=umax32=1024,var_off =(0x0; 0x400)) R8_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=1024, var_off=(0x0; 0x400)) 6: (79) r0 = *(u64 *)(r7 +0) ; R0_w=scalar() 7: (95) exit This prog loads flow_keys to r7, and adds the variable offset r8 to r7, and finally causes out-of-bounds access: BUG: unable to handle page fault for address: ffffc90014c80038 [...] Call Trace: bpf_dispatcher_nop_func include/linux/bpf.h:1231 [inline] __bpf_prog_run include/linux/filter.h:651 [inline] bpf_prog_run include/linux/filter.h:658 [inline] bpf_prog_run_pin_on_cpu include/linux/filter.h:675 [inline] bpf_flow_dissect+0x15f/0x350 net/core/flow_dissector.c:991 bpf_prog_test_run_flow_dissector+0x39d/0x620 net/bpf/test_run.c:1359 bpf_prog_test_run kernel/bpf/syscall.c:4107 [inline] __sys_bpf+0xf8f/0x4560 kernel/bpf/syscall.c:5475 __do_sys_bpf kernel/bpf/syscall.c:5561 [inline] __se_sys_bpf kernel/bpf/syscall.c:5559 [inline] __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:5559 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Fix this by rejecting ptr alu with variable offset on flow_keys. Applying the patch rejects the program with "R7 pointer arithmetic on flow_keys prohibited". Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Signed-off-by: Hao Sun Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240115082028.9992-1-sunhao.th@gmail.com --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index adbf330d364bb..65f598694d550 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12826,6 +12826,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } switch (base_type(ptr_reg->type)) { + case PTR_TO_FLOW_KEYS: + if (known) + break; + fallthrough; case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) -- GitLab From 33772ff3b887eb2f426ed66bcb1808837a40669c Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Mon, 15 Jan 2024 09:20:28 +0100 Subject: [PATCH 1000/1290] selftests/bpf: Add test for alu on PTR_TO_FLOW_KEYS Add a test case for PTR_TO_FLOW_KEYS alu. Testing if alu with variable offset on flow_keys is rejected. For the fixed offset success case, we already have C code coverage to verify (e.g. via bpf_flow.c). Signed-off-by: Hao Sun Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240115082028.9992-2-sunhao.th@gmail.com --- .../bpf/progs/verifier_value_illegal_alu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c index 71814a7532160..a9ab37d3b9e2d 100644 --- a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c +++ b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c @@ -146,4 +146,23 @@ l0_%=: exit; \ : __clobber_all); } +SEC("flow_dissector") +__description("flow_keys illegal alu op with variable offset") +__failure __msg("R7 pointer arithmetic on flow_keys prohibited") +__naked void flow_keys_illegal_variable_offset_alu(void) +{ + asm volatile(" \ + r6 = r1; \ + r7 = *(u64*)(r6 + %[flow_keys_off]); \ + r8 = 8; \ + r8 /= 1; \ + r8 &= 8; \ + r7 += r8; \ + r0 = *(u64*)(r7 + 0); \ + exit; \ +" : + : __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- GitLab From be50df31c4e2a69f961a3bb759346d299eaa2b23 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 16 Jan 2024 17:34:31 +0300 Subject: [PATCH 1001/1290] block: bio-integrity: fix kcalloc() arguments order When compiling with gcc version 14.0.1 20240116 (experimental) and W=1, I've noticed the following warning: block/bio-integrity.c: In function 'bio_integrity_map_user': block/bio-integrity.c:339:38: warning: 'kcalloc' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Wcalloc-transposed-args] 339 | bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL); | ^ block/bio-integrity.c:339:38: note: earlier argument should specify number of elements, later size of each element Since 'n' and 'size' arguments of 'kcalloc()' are multiplied to calculate the final size, their actual order doesn't affect the result and so this is not a bug. But it's still worth to fix it. Fixes: 492c5d455969 ("block: bio-integrity: directly map user buffers") Signed-off-by: Dmitry Antipov Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20240116143437.89060-1-dmantipov@yandex.ru Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index feef615e2c9c8..c9a16fba58b9c 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -336,7 +336,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, if (nr_vecs > BIO_MAX_VECS) return -E2BIG; if (nr_vecs > UIO_FASTIOV) { - bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL); + bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL); if (!bvec) return -ENOMEM; pages = NULL; -- GitLab From 58f65f9db7e0de366a5a115c2e2c0703858bba69 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 16 Jan 2024 21:43:25 +0100 Subject: [PATCH 1002/1290] Input: atkbd - use ab83 as id when skipping the getid command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Barnabás reported that the change to skip the getid command when the controller is in translated mode on laptops caused the Version field of his "AT Translated Set 2 keyboard" input device to change from ab83 to abba, breaking a custom hwdb entry for this keyboard. Use the standard ab83 id for keyboards when getid is skipped (rather then that getid fails) to avoid reporting a different Version to userspace then before skipping the getid. Fixes: 936e4d49ecbc ("Input: atkbd - skip ATKBD_CMD_GETID in translated mode") Reported-by: Barnabás Pőcze Closes: https://lore.kernel.org/linux-input/W1ydwoG2fYv85Z3C3yfDOJcVpilEvGge6UGa9kZh8zI2-qkHXp7WLnl2hSkFz63j-c7WupUWI5TLL6n7Lt8DjRuU-yJBwLYWrreb1hbnd6A=@protonmail.com/ Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240116204325.7719-1-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/atkbd.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/input/keyboard/atkbd.c b/drivers/input/keyboard/atkbd.c index 786f00f6b7fd8..13ef6284223da 100644 --- a/drivers/input/keyboard/atkbd.c +++ b/drivers/input/keyboard/atkbd.c @@ -791,9 +791,9 @@ static bool atkbd_is_portable_device(void) * not work. So in this case simply assume a keyboard is connected to avoid * confusing some laptop keyboards. * - * Skipping ATKBD_CMD_GETID ends up using a fake keyboard id. Using a fake id is - * ok in translated mode, only atkbd_select_set() checks atkbd->id and in - * translated mode that is a no-op. + * Skipping ATKBD_CMD_GETID ends up using a fake keyboard id. Using the standard + * 0xab83 id is ok in translated mode, only atkbd_select_set() checks atkbd->id + * and in translated mode that is a no-op. */ static bool atkbd_skip_getid(struct atkbd *atkbd) { @@ -811,6 +811,7 @@ static int atkbd_probe(struct atkbd *atkbd) { struct ps2dev *ps2dev = &atkbd->ps2dev; unsigned char param[2]; + bool skip_getid; /* * Some systems, where the bit-twiddling when testing the io-lines of the @@ -832,7 +833,8 @@ static int atkbd_probe(struct atkbd *atkbd) */ param[0] = param[1] = 0xa5; /* initialize with invalid values */ - if (atkbd_skip_getid(atkbd) || ps2_command(ps2dev, param, ATKBD_CMD_GETID)) { + skip_getid = atkbd_skip_getid(atkbd); + if (skip_getid || ps2_command(ps2dev, param, ATKBD_CMD_GETID)) { /* * If the get ID command was skipped or failed, we check if we can at least set @@ -842,7 +844,7 @@ static int atkbd_probe(struct atkbd *atkbd) param[0] = 0; if (ps2_command(ps2dev, param, ATKBD_CMD_SETLEDS)) return -1; - atkbd->id = 0xabba; + atkbd->id = skip_getid ? 0xab83 : 0xabba; return 0; } -- GitLab From 53c41052ba3121761e6f62a813961164532a214f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 16 Jan 2024 16:12:18 -0500 Subject: [PATCH 1003/1290] eventfs: Have the inodes all for files and directories all be the same The dentries and inodes are created in the readdir for the sole purpose of getting a consistent inode number. Linus stated that is unnecessary, and that all inodes can have the same inode number. For a virtual file system they are pretty meaningless. Instead use a single unique inode number for all files and one for all directories. Link: https://lore.kernel.org/all/20240116133753.2808d45e@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20240116211353.412180363@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Christian Brauner Cc: Al Viro Cc: Ajay Kaher Suggested-by: Linus Torvalds Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index fdff53d5a1f87..5edf0b96758ba 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -32,6 +32,10 @@ */ static DEFINE_MUTEX(eventfs_mutex); +/* Choose something "unique" ;-) */ +#define EVENTFS_FILE_INODE_INO 0x12c4e37 +#define EVENTFS_DIR_INODE_INO 0x134b2f5 + /* * The eventfs_inode (ei) itself is protected by SRCU. It is released from * its parent's list and will have is_freed set (under eventfs_mutex). @@ -352,6 +356,9 @@ static struct dentry *create_file(const char *name, umode_t mode, inode->i_fop = fop; inode->i_private = data; + /* All files will have the same inode number */ + inode->i_ino = EVENTFS_FILE_INODE_INO; + ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; d_instantiate(dentry, inode); @@ -388,6 +395,9 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent inode->i_op = &eventfs_root_dir_inode_operations; inode->i_fop = &eventfs_file_operations; + /* All directories will have the same inode number */ + inode->i_ino = EVENTFS_DIR_INODE_INO; + ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; -- GitLab From 7bed6f3d08b7af27b7015da8dc3acf2b9c1f21d7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Jan 2024 21:29:59 +0000 Subject: [PATCH 1004/1290] block: Fix iterating over an empty bio with bio_for_each_folio_all If the bio contains no data, bio_first_folio() calls page_folio() on a NULL pointer and oopses. Move the test that we've reached the end of the bio from bio_next_folio() to bio_first_folio(). Reported-by: syzbot+8b23309d5788a79d3eea@syzkaller.appspotmail.com Reported-by: syzbot+004c1e0fced2b4bc3dcc@syzkaller.appspotmail.com Fixes: 640d1930bef4 ("block: Add bio_for_each_folio_all()") Cc: stable@vger.kernel.org Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240116212959.3413014-1-willy@infradead.org [axboe: add unlikely() to error case] Signed-off-by: Jens Axboe --- include/linux/bio.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index ec4db73e5f4ec..875d792bffff8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -286,6 +286,11 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio, { struct bio_vec *bvec = bio_first_bvec_all(bio) + i; + if (unlikely(i >= bio->bi_vcnt)) { + fi->folio = NULL; + return; + } + fi->folio = page_folio(bvec->bv_page); fi->offset = bvec->bv_offset + PAGE_SIZE * (bvec->bv_page - &fi->folio->page); @@ -303,10 +308,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) fi->offset = 0; fi->length = min(folio_size(fi->folio), fi->_seg_count); fi->_next = folio_next(fi->folio); - } else if (fi->_i + 1 < bio->bi_vcnt) { - bio_first_folio(fi, bio, fi->_i + 1); } else { - fi->folio = NULL; + bio_first_folio(fi, bio, fi->_i + 1); } } -- GitLab From 852e46e239ee6db3cd220614cf8bce96e79227c2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 16 Jan 2024 16:12:19 -0500 Subject: [PATCH 1005/1290] eventfs: Do not create dentries nor inodes in iterate_shared The original eventfs code added a wrapper around the dcache_readdir open callback and created all the dentries and inodes at open, and increment their ref count. A wrapper was added around the dcache_readdir release function to decrement all the ref counts of those created inodes and dentries. But this proved to be buggy[1] for when a kprobe was created during a dir read, it would create a dentry between the open and the release, and because the release would decrement all ref counts of all files and directories, that would include the kprobe directory that was not there to have its ref count incremented in open. This would cause the ref count to go to negative and later crash the kernel. To solve this, the dentries and inodes that were created and had their ref count upped in open needed to be saved. That list needed to be passed from the open to the release, so that the release would only decrement the ref counts of the entries that were incremented in the open. Unfortunately, the dcache_readdir logic was already using the file->private_data, which is the only field that can be used to pass information from the open to the release. What was done was the eventfs created another descriptor that had a void pointer to save the dcache_readdir pointer, and it wrapped all the callbacks, so that it could save the list of entries that had their ref counts incremented in the open, and pass it to the release. The wrapped callbacks would just put back the dcache_readdir pointer and call the functions it used so it could still use its data[2]. But Linus had an issue with the "hijacking" of the file->private_data (unfortunately this discussion was on a security list, so no public link). Which we finally agreed on doing everything within the iterate_shared callback and leave the dcache_readdir out of it[3]. All the information needed for the getents() could be created then. But this ended up being buggy too[4]. The iterate_shared callback was not the right place to create the dentries and inodes. Even Christian Brauner had issues with that[5]. An attempt was to go back to creating the inodes and dentries at the open, create an array to store the information in the file->private_data, and pass that information to the other callbacks.[6] The difference between that and the original method, is that it does not use dcache_readdir. It also does not up the ref counts of the dentries and pass them. Instead, it creates an array of a structure that saves the dentry's name and inode number. That information is used in the iterate_shared callback, and the array is freed in the dir release. The dentries and inodes created in the open are not used for the iterate_share or release callbacks. Just their names and inode numbers. Linus did not like that either[7] and just wanted to remove the dentries being created in iterate_shared and use the hard coded inode numbers. [ All this while Linus enjoyed an unexpected vacation during the merge window due to lack of power. ] [1] https://lore.kernel.org/linux-trace-kernel/20230919211804.230edf1e@gandalf.local.home/ [2] https://lore.kernel.org/linux-trace-kernel/20230922163446.1431d4fa@gandalf.local.home/ [3] https://lore.kernel.org/linux-trace-kernel/20240104015435.682218477@goodmis.org/ [4] https://lore.kernel.org/all/202401152142.bfc28861-oliver.sang@intel.com/ [5] https://lore.kernel.org/all/20240111-unzahl-gefegt-433acb8a841d@brauner/ [6] https://lore.kernel.org/all/20240116114711.7e8637be@gandalf.local.home/ [7] https://lore.kernel.org/all/20240116170154.5bf0a250@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20240116211353.573784051@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Linus Torvalds Cc: Christian Brauner Cc: Al Viro Cc: Ajay Kaher Fixes: 493ec81a8fb8 ("eventfs: Stop using dcache_readdir() for getdents()") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202401152142.bfc28861-oliver.sang@intel.com Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 5edf0b96758ba..10580d6b50125 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -727,8 +727,6 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) struct eventfs_inode *ei_child; struct tracefs_inode *ti; struct eventfs_inode *ei; - struct dentry *ei_dentry = NULL; - struct dentry *dentry; const char *name; umode_t mode; int idx; @@ -749,11 +747,11 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) mutex_lock(&eventfs_mutex); ei = READ_ONCE(ti->private); - if (ei && !ei->is_freed) - ei_dentry = READ_ONCE(ei->dentry); + if (ei && ei->is_freed) + ei = NULL; mutex_unlock(&eventfs_mutex); - if (!ei || !ei_dentry) + if (!ei) goto out; /* @@ -780,11 +778,7 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) if (r <= 0) continue; - dentry = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops); - if (!dentry) - goto out; - ino = dentry->d_inode->i_ino; - dput(dentry); + ino = EVENTFS_FILE_INODE_INO; if (!dir_emit(ctx, name, strlen(name), ino, DT_REG)) goto out; @@ -808,11 +802,7 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) name = ei_child->name; - dentry = create_dir_dentry(ei, ei_child, ei_dentry); - if (!dentry) - goto out_dec; - ino = dentry->d_inode->i_ino; - dput(dentry); + ino = EVENTFS_DIR_INODE_INO; if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR)) goto out_dec; -- GitLab From 1057066009c4325bb1d8430c9274894d0860e7c3 Mon Sep 17 00:00:00 2001 From: Erick Archer Date: Mon, 15 Jan 2024 19:16:58 +0100 Subject: [PATCH 1006/1290] eventfs: Use kcalloc() instead of kzalloc() As noted in the "Deprecated Interfaces, Language Features, Attributes, and Conventions" documentation [1], size calculations (especially multiplication) should not be performed in memory allocator (or similar) function arguments due to the risk of them overflowing. This could lead to values wrapping around and a smaller allocation being made than the caller was expecting. Using those allocations could lead to linear overflows of heap memory and other misbehaviors. So, use the purpose specific kcalloc() function instead of the argument size * count in the kzalloc() function. [1] https://www.kernel.org/doc/html/next/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments Link: https://lore.kernel.org/linux-trace-kernel/20240115181658.4562-1-erick.archer@gmx.com Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Link: https://github.com/KSPP/linux/issues/162 Signed-off-by: Erick Archer Reviewed-by: Gustavo A. R. Silva Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 10580d6b50125..6795fda2af191 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -97,7 +97,7 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, /* Preallocate the children mode array if necessary */ if (!(dentry->d_inode->i_mode & S_IFDIR)) { if (!ei->entry_attrs) { - ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries, + ei->entry_attrs = kcalloc(ei->nr_entries, sizeof(*ei->entry_attrs), GFP_NOFS); if (!ei->entry_attrs) { ret = -ENOMEM; @@ -874,7 +874,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode } if (size) { - ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL); + ei->d_children = kcalloc(size, sizeof(*ei->d_children), GFP_KERNEL); if (!ei->d_children) { kfree_const(ei->name); kfree(ei); @@ -941,7 +941,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry goto fail; if (size) { - ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL); + ei->d_children = kcalloc(size, sizeof(*ei->d_children), GFP_KERNEL); if (!ei->d_children) goto fail; } -- GitLab From 8ca5d2641be217a78a891d4dbe2a46232d1d8eb9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 16 Jan 2024 10:51:34 +0000 Subject: [PATCH 1007/1290] cifs: remove redundant variable tcon_exist The variable tcon_exist is being assigned however it is never read, the variable is redundant and can be removed. Cleans up clang scan build warning: warning: Although the value stored to 'tcon_exist' is used in the enclosing expression, the value is never actually readfrom 'tcon_exist' [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index bd25c34dc398b..50f6bf16b6245 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -3918,7 +3918,7 @@ void smb2_reconnect_server(struct work_struct *work) struct cifs_ses *ses, *ses2; struct cifs_tcon *tcon, *tcon2; struct list_head tmp_list, tmp_ses_list; - bool tcon_exist = false, ses_exist = false; + bool ses_exist = false; bool tcon_selected = false; int rc; bool resched = false; @@ -3964,7 +3964,7 @@ void smb2_reconnect_server(struct work_struct *work) if (tcon->need_reconnect || tcon->need_reopen_files) { tcon->tc_count++; list_add_tail(&tcon->rlist, &tmp_list); - tcon_selected = tcon_exist = true; + tcon_selected = true; } } /* @@ -3973,7 +3973,7 @@ void smb2_reconnect_server(struct work_struct *work) */ if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) { list_add_tail(&ses->tcon_ipc->rlist, &tmp_list); - tcon_selected = tcon_exist = true; + tcon_selected = true; cifs_smb_ses_inc_refcount(ses); } /* -- GitLab From 776dac5a662774f07a876b650ba578d0a62d20db Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Thu, 11 Jan 2024 15:20:18 +0800 Subject: [PATCH 1008/1290] net: dsa: vsc73xx: Add null pointer check to vsc73xx_gpio_probe devm_kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Fixes: 05bd97fc559d ("net: dsa: Add Vitesse VSC73xx DSA router driver") Signed-off-by: Kunwu Chan Suggested-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240111072018.75971-1-chentao@kylinos.cn Signed-off-by: Jakub Kicinski --- drivers/net/dsa/vitesse-vsc73xx-core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c index dd50502e21229..ae70eac3be28f 100644 --- a/drivers/net/dsa/vitesse-vsc73xx-core.c +++ b/drivers/net/dsa/vitesse-vsc73xx-core.c @@ -1135,6 +1135,8 @@ static int vsc73xx_gpio_probe(struct vsc73xx *vsc) vsc->gc.label = devm_kasprintf(vsc->dev, GFP_KERNEL, "VSC%04x", vsc->chipid); + if (!vsc->gc.label) + return -ENOMEM; vsc->gc.ngpio = 4; vsc->gc.owner = THIS_MODULE; vsc->gc.parent = vsc->dev; -- GitLab From 97eb5d51b4a584a60e5d096bdb6b33edc9f50d8d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 15 Jan 2024 12:43:38 +0000 Subject: [PATCH 1009/1290] net: sfp-bus: fix SFP mode detect from bitrate The referenced commit moved the setting of the Autoneg and pause bits early in sfp_parse_support(). However, we check whether the modes are empty before using the bitrate to set some modes. Setting these bits so early causes that test to always be false, preventing this working, and thus some modules that used to work no longer do. Move them just before the call to the quirk. Fixes: 8110633db49d ("net: sfp-bus: allow SFP quirks to override Autoneg and pause bits") Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Link: https://lore.kernel.org/r/E1rPMJW-001Ahf-L0@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/sfp-bus.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 6fa679b36290e..db39dec7f2471 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -151,10 +151,6 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned int br_min, br_nom, br_max; __ETHTOOL_DECLARE_LINK_MODE_MASK(modes) = { 0, }; - phylink_set(modes, Autoneg); - phylink_set(modes, Pause); - phylink_set(modes, Asym_Pause); - /* Decode the bitrate information to MBd */ br_min = br_nom = br_max = 0; if (id->base.br_nominal) { @@ -339,6 +335,10 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, } } + phylink_set(modes, Autoneg); + phylink_set(modes, Pause); + phylink_set(modes, Asym_Pause); + if (bus->sfp_quirk && bus->sfp_quirk->modes) bus->sfp_quirk->modes(id, modes, interfaces); -- GitLab From e9ce7ededf14af3396312f02d1622f4889d676ca Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 15 Jan 2024 14:59:22 +0100 Subject: [PATCH 1010/1290] selftests: rtnetlink: use setup_ns in bonding test This is a follow-up of commit a159cbe81d3b ("selftests: rtnetlink: check enslaving iface in a bond") after the merge of net-next into net. The goal is to follow the new convention, see commit d3b6b1116127 ("selftests/net: convert rtnetlink.sh to run it in unique namespace") for more details. Let's use also the generic dummy name instead of defining a new one. Signed-off-by: Nicolas Dichtel Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20240115135922.3662648-1-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/rtnetlink.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index a31be0eaaa50f..4667d74579d13 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -1244,21 +1244,19 @@ kci_test_address_proto() kci_test_enslave_bonding() { - local testns="testns" local bond="bond123" - local dummy="dummy123" local ret=0 - run_cmd ip netns add "$testns" - if [ $ret -ne 0 ]; then + setup_ns testns + if [ $? -ne 0 ]; then end_test "SKIP bonding tests: cannot add net namespace $testns" return $ksft_skip fi run_cmd ip -netns $testns link add dev $bond type bond mode balance-rr - run_cmd ip -netns $testns link add dev $dummy type dummy - run_cmd ip -netns $testns link set dev $dummy up - run_cmd ip -netns $testns link set dev $dummy master $bond down + run_cmd ip -netns $testns link add dev $devdummy type dummy + run_cmd ip -netns $testns link set dev $devdummy up + run_cmd ip -netns $testns link set dev $devdummy master $bond down if [ $ret -ne 0 ]; then end_test "FAIL: initially up interface added to a bond and set down" ip netns del "$testns" -- GitLab From 2772ae4d66d17c6a8b4c167ddb660fc8d7972da5 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Wed, 17 Jan 2024 12:42:59 +0800 Subject: [PATCH 1011/1290] modpost: Ignore relaxation and alignment marker relocs on LoongArch With recent trunk versions of binutils and gcc, alignment directives are represented with R_LARCH_ALIGN relocs on LoongArch, which is necessary for the linker to maintain alignment requirements during its relaxation passes. And even though the kernel is built with relaxation disabled, so far a small number of R_LARCH_RELAX marker relocs are still emitted as part of la.* pseudo instructions in assembly. These two kinds of relocs do not refer to symbols, which can trip up modpost's section mismatch checks, because the r_offset of said relocs can be zero or any other meaningless value, eventually leading to a `from == NULL` condition in default_mismatch_handler and SIGSEGV. As the two kinds of relocs are not concerned with symbols, just ignore them for section mismatch check purposes. Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- scripts/mod/modpost.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index cb6406f485a96..68ab45273a22f 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1346,6 +1346,14 @@ static Elf_Addr addend_mips_rel(uint32_t *location, unsigned int r_type) #define R_LARCH_SUB32 55 #endif +#ifndef R_LARCH_RELAX +#define R_LARCH_RELAX 100 +#endif + +#ifndef R_LARCH_ALIGN +#define R_LARCH_ALIGN 102 +#endif + static void get_rel_type_and_sym(struct elf_info *elf, uint64_t r_info, unsigned int *r_type, unsigned int *r_sym) { @@ -1400,9 +1408,16 @@ static void section_rela(struct module *mod, struct elf_info *elf, continue; break; case EM_LOONGARCH: - if (!strcmp("__ex_table", fromsec) && - r_type == R_LARCH_SUB32) + switch (r_type) { + case R_LARCH_SUB32: + if (!strcmp("__ex_table", fromsec)) + continue; + break; + case R_LARCH_RELAX: + case R_LARCH_ALIGN: + /* These relocs do not refer to symbols */ continue; + } break; } -- GitLab From f58b0abae839f06be9d791d16196922a4b281777 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Wed, 17 Jan 2024 12:43:00 +0800 Subject: [PATCH 1012/1290] scripts/min-tool-version.sh: Raise minimum clang version to 18.0.0 for loongarch The existing mainline clang development version encounters difficulties compiling the LoongArch kernel module. It is anticipated that this issue will be resolved in the upcoming 18.0.0 release. To prevent user confusion arising from broken builds, it is advisable to raise the minimum required clang version for LoongArch to 18.0.0. Suggested-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Acked-by: Nick Desaulniers Link: https://github.com/ClangBuiltLinux/linux/issues/1941 Signed-off-by: Tiezhu Yang Signed-off-by: WANG Rui Signed-off-by: Huacai Chen --- scripts/min-tool-version.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh index fd5ffdb81bab7..1c6ab10dc69ee 100755 --- a/scripts/min-tool-version.sh +++ b/scripts/min-tool-version.sh @@ -26,6 +26,8 @@ gcc) llvm) if [ "$SRCARCH" = s390 ]; then echo 15.0.0 + elif [ "$SRCARCH" = loongarch ]; then + echo 18.0.0 else echo 11.0.0 fi -- GitLab From 90868ff9cadecd46fa2a4f5501c66bfea8ade9b7 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Wed, 17 Jan 2024 12:43:00 +0800 Subject: [PATCH 1013/1290] LoongArch: Enable initial Rust support Enable initial Rust support for LoongArch. Tested-by: Miguel Ojeda Signed-off-by: WANG Rui Signed-off-by: Huacai Chen --- Documentation/rust/arch-support.rst | 13 +++++++------ arch/loongarch/Kconfig | 1 + arch/loongarch/Makefile | 3 +++ scripts/generate_rust_target.rs | 7 +++++++ 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/Documentation/rust/arch-support.rst b/Documentation/rust/arch-support.rst index b91e9ef4d0c21..73203ba1e9011 100644 --- a/Documentation/rust/arch-support.rst +++ b/Documentation/rust/arch-support.rst @@ -12,10 +12,11 @@ which uses ``libclang``. Below is a general summary of architectures that currently work. Level of support corresponds to ``S`` values in the ``MAINTAINERS`` file. -============ ================ ============================================== -Architecture Level of support Constraints -============ ================ ============================================== -``um`` Maintained ``x86_64`` only. -``x86`` Maintained ``x86_64`` only. -============ ================ ============================================== +============= ================ ============================================== +Architecture Level of support Constraints +============= ================ ============================================== +``loongarch`` Maintained - +``um`` Maintained ``x86_64`` only. +``x86`` Maintained ``x86_64`` only. +============= ================ ============================================== diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index ee123820a4760..6b9da3effdf8e 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -140,6 +140,7 @@ config LOONGARCH select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RETHOOK select HAVE_RSEQ + select HAVE_RUST select HAVE_SAMPLE_FTRACE_DIRECT select HAVE_SAMPLE_FTRACE_DIRECT_MULTI select HAVE_SETUP_PER_CPU_AREA if NUMA diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 4ba8d67ddb097..ba45cb7b621cd 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -81,8 +81,11 @@ KBUILD_AFLAGS_MODULE += -Wa,-mla-global-with-abs KBUILD_CFLAGS_MODULE += -fplt -Wa,-mla-global-with-abs,-mla-local-with-abs endif +KBUILD_RUSTFLAGS_MODULE += -Crelocation-model=pic + ifeq ($(CONFIG_RELOCATABLE),y) KBUILD_CFLAGS_KERNEL += -fPIE +KBUILD_RUSTFLAGS_KERNEL += -Crelocation-model=pie LDFLAGS_vmlinux += -static -pie --no-dynamic-linker -z notext $(call ld-option, --apply-dynamic-relocs) endif diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs index 3c6cbe2b278d3..0da52b548ba50 100644 --- a/scripts/generate_rust_target.rs +++ b/scripts/generate_rust_target.rs @@ -161,6 +161,13 @@ fn main() { ts.push("features", features); ts.push("llvm-target", "x86_64-linux-gnu"); ts.push("target-pointer-width", "64"); + } else if cfg.has("LOONGARCH") { + ts.push("arch", "loongarch64"); + ts.push("data-layout", "e-m:e-p:64:64-i64:64-i128:128-n64-S128"); + ts.push("features", "-f,-d"); + ts.push("llvm-target", "loongarch64-linux-gnusf"); + ts.push("llvm-abiname", "lp64s"); + ts.push("target-pointer-width", "64"); } else { panic!("Unsupported architecture"); } -- GitLab From 8e07e0e3964ca4e23ce7b68e2096fe660a888942 Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Wed, 17 Jan 2024 12:43:00 +0800 Subject: [PATCH 1014/1290] dt-bindings: loongarch: Add CPU bindings for LoongArch Add the available CPUs in LoongArch binding with DT schema format using json-schema. Reviewed-by: Conor Dooley Signed-off-by: Binbin Zhou Signed-off-by: Huacai Chen --- .../devicetree/bindings/loongarch/cpus.yaml | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 Documentation/devicetree/bindings/loongarch/cpus.yaml diff --git a/Documentation/devicetree/bindings/loongarch/cpus.yaml b/Documentation/devicetree/bindings/loongarch/cpus.yaml new file mode 100644 index 0000000000000..f175872995e11 --- /dev/null +++ b/Documentation/devicetree/bindings/loongarch/cpus.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/loongarch/cpus.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: LoongArch CPUs + +maintainers: + - Binbin Zhou + +description: + This document describes the list of LoongArch CPU cores that support FDT, + it describe the layout of CPUs in a system through the "cpus" node. + +allOf: + - $ref: /schemas/cpu.yaml# + +properties: + compatible: + enum: + - loongson,la264 + - loongson,la364 + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + +required: + - compatible + - reg + - clocks + +unevaluatedProperties: false + +examples: + - | + #include + + cpus { + #size-cells = <0>; + #address-cells = <1>; + + cpu@0 { + compatible = "loongson,la264"; + device_type = "cpu"; + reg = <0>; + clocks = <&clk LOONGSON2_NODE_CLK>; + }; + + cpu@1 { + compatible = "loongson,la264"; + device_type = "cpu"; + reg = <1>; + clocks = <&clk LOONGSON2_NODE_CLK>; + }; + }; + +... -- GitLab From ec6b36edf0cea06d651ef14092921a339b4eea2f Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Wed, 17 Jan 2024 12:43:00 +0800 Subject: [PATCH 1015/1290] dt-bindings: loongarch: Add Loongson SoC boards compatibles Add Loongson SoC boards binding with DT schema format using json-schema. Reviewed-by: Conor Dooley Signed-off-by: Binbin Zhou Signed-off-by: Huacai Chen --- .../bindings/loongarch/loongson.yaml | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 Documentation/devicetree/bindings/loongarch/loongson.yaml diff --git a/Documentation/devicetree/bindings/loongarch/loongson.yaml b/Documentation/devicetree/bindings/loongarch/loongson.yaml new file mode 100644 index 0000000000000..e1a4a97b7576c --- /dev/null +++ b/Documentation/devicetree/bindings/loongarch/loongson.yaml @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/loongarch/loongson.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Loongson SoC-based boards + +maintainers: + - Binbin Zhou + +properties: + $nodename: + const: '/' + compatible: + oneOf: + - description: Loongson-2K0500 processor based boards + items: + - const: loongson,ls2k0500-ref + - const: loongson,ls2k0500 + + - description: Loongson-2K1000 processor based boards + items: + - const: loongson,ls2k1000-ref + - const: loongson,ls2k1000 + + - description: Loongson-2K2000 processor based boards + items: + - const: loongson,ls2k2000-ref + - const: loongson,ls2k2000 + +additionalProperties: true + +... -- GitLab From aaeebb3ea4f2d6674b0fbc8bd48bf8862309f191 Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Wed, 17 Jan 2024 12:43:00 +0800 Subject: [PATCH 1016/1290] dt-bindings: interrupt-controller: loongson,liointc: Fix dtbs_check warning for reg-names As we know, the Loongson-2K0500 is a single-core CPU, and the core1- related register (isr1) does not exist. So "reg" and "reg-names" should be set to "minItems 2"(main nad isr0). This fixes dtbs_check warning: DTC_CHK arch/loongarch/boot/dts/loongson-2k0500-ref.dtb arch/loongarch/boot/dts/loongson-2k0500-ref.dtb: interrupt-controller@1fe11400: reg-names: ['main', 'isr0'] is too short From schema: Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml arch/loongarch/boot/dts/loongson-2k0500-ref.dtb: interrupt-controller@1fe11400: Unevaluated properties are not allowed ('reg-names' was unexpected) From schema: Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml arch/loongarch/boot/dts/loongson-2k0500-ref.dtb: interrupt-controller@1fe11400: reg: [[0, 534844416, 0, 64], [0, 534843456, 0, 8]] is too short From schema: Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml arch/loongarch/boot/dts/loongson-2k0500-ref.dtb: interrupt-controller@1fe11440: reg-names: ['main', 'isr0'] is too short From schema: Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml Acked-by: Jiaxun Yang Reviewed-by: Rob Herring Signed-off-by: Binbin Zhou Signed-off-by: Huacai Chen --- .../interrupt-controller/loongson,liointc.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml b/Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml index 00b570c829039..a3276c1d9b59f 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml @@ -11,8 +11,13 @@ maintainers: description: | This interrupt controller is found in the Loongson-3 family of chips and - Loongson-2K1000 chip, as the primary package interrupt controller which + Loongson-2K series chips, as the primary package interrupt controller which can route local I/O interrupt to interrupt lines of cores. + Be aware of the following points. + 1.The Loongson-2K0500 is a single core CPU; + 2.The Loongson-2K0500/2K1000 has 64 device interrupt sources as inputs, so we + need to define two nodes in dts{i} to describe the "0-31" and "32-61" interrupt + sources respectively. allOf: - $ref: /schemas/interrupt-controller.yaml# @@ -33,6 +38,7 @@ properties: - const: main - const: isr0 - const: isr1 + minItems: 2 interrupt-controller: true @@ -86,7 +92,8 @@ if: then: properties: reg: - minItems: 3 + minItems: 2 + maxItems: 3 required: - reg-names -- GitLab From db8ce2407090f695339e3406a034377dcdc2c942 Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Wed, 17 Jan 2024 12:43:00 +0800 Subject: [PATCH 1017/1290] dt-bindings: interrupt-controller: loongson,liointc: Fix dtbs_check warning for interrupt-names The Loongson-2K0500/2K1000 CPUs have 64 interrupt sources as inputs, and a route-mapped node handles up to 32 interrupt sources, so two liointc nodes are defined in dts{i}. Of course, we have to make sure that the routing outputs ("intx") of the two nodes do not conflict, i.e. "int0" can only be used as a routing output for one of them. Therefore, "interrupt-names" should be defined as "pattern". In addition, since "interrupt-names" and "interrupts" are one-to-one correspondence, we pass it to get the corresponding interrupt number in the driver. Setting it to "required" does not break ABI, because it is already logically represented as "required". This fixes dtbs_check warning: DTC_CHK arch/loongarch/boot/dts/loongson-2k0500-ref.dtb arch/loongarch/boot/dts/loongson-2k0500-ref.dtb: interrupt-controller@1fe11440: interrupt-names:0: 'int0' was expected From schema: Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml arch/loongarch/boot/dts/loongson-2k0500-ref.dtb: interrupt-controller@1fe11440: Unevaluated properties are not allowed ('interrupt-names' was unexpected) From schema: Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml DTC_CHK arch/loongarch/boot/dts/loongson-2k1000-ref.dtb arch/loongarch/boot/dts/loongson-2k1000-ref.dtb: interrupt-controller@1fe01440: interrupt-names:0: 'int0' was expected From schema: Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml arch/loongarch/boot/dts/loongson-2k1000-ref.dtb: interrupt-controller@1fe01440: Unevaluated properties are not allowed ('interrupt-names' was unexpected) From schema: Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml Acked-by: Jiaxun Yang Reviewed-by: Rob Herring Signed-off-by: Binbin Zhou Signed-off-by: Huacai Chen --- .../bindings/interrupt-controller/loongson,liointc.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml b/Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml index a3276c1d9b59f..60441f0c5d721 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml @@ -51,11 +51,9 @@ properties: interrupt-names: description: List of names for the parent interrupts. items: - - const: int0 - - const: int1 - - const: int2 - - const: int3 + pattern: int[0-3] minItems: 1 + maxItems: 4 '#interrupt-cells': const: 2 @@ -75,6 +73,7 @@ required: - compatible - reg - interrupts + - interrupt-names - interrupt-controller - '#interrupt-cells' - loongson,parent_int_map -- GitLab From 5f346a6e5970229c19c059e8fa62c3dbdde56e7b Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Wed, 17 Jan 2024 12:43:00 +0800 Subject: [PATCH 1018/1290] LoongArch: Allow device trees be built into the kernel During the upstream progress of those DT-based drivers, DT properties are changed a lot so very different from those in existing bootloaders. It is inevitably that some existing systems do not provide a standard, canonical device tree to the kernel at boot time. So let's provide a device tree table in the kernel, keyed by the dts filename, containing the relevant DTBs. We can use the built-in dts files as references. Each SoC has only one built-in dts file which describes all possible device information of that SoC, so the dts files are good examples during development. And as a reference, our built-in dts file only enables the most basic bootable combinations (so it is generic enough), acts as an alternative in case the dts in the bootloader is unexpected. Signed-off-by: Binbin Zhou Signed-off-by: Huacai Chen --- arch/loongarch/Kbuild | 1 + arch/loongarch/Kconfig | 18 ++++++++++++++++++ arch/loongarch/Makefile | 3 ++- arch/loongarch/boot/dts/Makefile | 3 +-- arch/loongarch/kernel/setup.c | 12 +++++++++--- 5 files changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/loongarch/Kbuild b/arch/loongarch/Kbuild index beb8499dd8ed8..bfa21465d83af 100644 --- a/arch/loongarch/Kbuild +++ b/arch/loongarch/Kbuild @@ -4,6 +4,7 @@ obj-y += net/ obj-y += vdso/ obj-$(CONFIG_KVM) += kvm/ +obj-$(CONFIG_BUILTIN_DTB) += boot/dts/ # for cleaning subdir- += boot diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 6b9da3effdf8e..ede2ef26726a8 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -375,6 +375,24 @@ config CMDLINE_FORCE endchoice +config BUILTIN_DTB + bool "Enable built-in dtb in kernel" + depends on OF + help + Some existing systems do not provide a canonical device tree to + the kernel at boot time. Let's provide a device tree table in the + kernel, keyed by the dts filename, containing the relevant DTBs. + + Built-in DTBs are generic enough and can be used as references. + +config BUILTIN_DTB_NAME + string "Source file for built-in dtb" + depends on BUILTIN_DTB + help + Base name (without suffix, relative to arch/loongarch/boot/dts/) + for the DTS file that will be used to produce the DTB linked into + the kernel. + config DMI bool "Enable DMI scanning" select DMI_SCAN_MACHINE_NON_EFI_FALLBACK diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index ba45cb7b621cd..983aa2b1629a6 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -6,6 +6,7 @@ boot := arch/loongarch/boot KBUILD_DEFCONFIG := loongson3_defconfig +KBUILD_DTBS := dtbs image-name-y := vmlinux image-name-$(CONFIG_EFI_ZBOOT) := vmlinuz @@ -144,7 +145,7 @@ endif vdso-install-y += arch/loongarch/vdso/vdso.so.dbg -all: $(notdir $(KBUILD_IMAGE)) +all: $(notdir $(KBUILD_IMAGE)) $(KBUILD_DTBS) vmlinuz.efi: vmlinux.efi diff --git a/arch/loongarch/boot/dts/Makefile b/arch/loongarch/boot/dts/Makefile index 5f1f55e911adf..1e24cdb5180af 100644 --- a/arch/loongarch/boot/dts/Makefile +++ b/arch/loongarch/boot/dts/Makefile @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -dtstree := $(srctree)/$(src) -dtb-y := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts)) +obj-$(CONFIG_BUILTIN_DTB) += $(addsuffix .dtb.o, $(CONFIG_BUILTIN_DTB_NAME)) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index d183a745fb85d..15d366b8407c3 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -295,8 +295,12 @@ static void __init fdt_setup(void) if (acpi_os_get_root_pointer()) return; - /* Look for a device tree configuration table entry */ - fdt_pointer = efi_fdt_pointer(); + /* Prefer to use built-in dtb, checking its legality first. */ + if (!fdt_check_header(__dtb_start)) + fdt_pointer = __dtb_start; + else + fdt_pointer = efi_fdt_pointer(); /* Fallback to firmware dtb */ + if (!fdt_pointer || fdt_check_header(fdt_pointer)) return; @@ -330,7 +334,9 @@ static void __init bootcmdline_init(char **cmdline_p) if (boot_command_line[0]) strlcat(boot_command_line, " ", COMMAND_LINE_SIZE); - strlcat(boot_command_line, init_command_line, COMMAND_LINE_SIZE); + if (!strstr(boot_command_line, init_command_line)) + strlcat(boot_command_line, init_command_line, COMMAND_LINE_SIZE); + goto out; } #endif -- GitLab From 0f66569c85948c8b3c3edbe3e4ada6f98a4937ea Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Wed, 17 Jan 2024 12:43:07 +0800 Subject: [PATCH 1019/1290] LoongArch: dts: DeviceTree for Loongson-2K0500 Add DeviceTree file for Loongson-2K0500 processor, which integrates one 64-bit 2-issue superscalar LA264 processor core. Signed-off-by: Binbin Zhou Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/Makefile | 2 + .../boot/dts/loongson-2k0500-ref.dts | 88 ++++++ arch/loongarch/boot/dts/loongson-2k0500.dtsi | 266 ++++++++++++++++++ 3 files changed, 356 insertions(+) create mode 100644 arch/loongarch/boot/dts/loongson-2k0500-ref.dts create mode 100644 arch/loongarch/boot/dts/loongson-2k0500.dtsi diff --git a/arch/loongarch/boot/dts/Makefile b/arch/loongarch/boot/dts/Makefile index 1e24cdb5180af..89c9758bba7f1 100644 --- a/arch/loongarch/boot/dts/Makefile +++ b/arch/loongarch/boot/dts/Makefile @@ -1,3 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +dtb-y = loongson-2k0500-ref.dtb + obj-$(CONFIG_BUILTIN_DTB) += $(addsuffix .dtb.o, $(CONFIG_BUILTIN_DTB_NAME)) diff --git a/arch/loongarch/boot/dts/loongson-2k0500-ref.dts b/arch/loongarch/boot/dts/loongson-2k0500-ref.dts new file mode 100644 index 0000000000000..b38071a4d0b02 --- /dev/null +++ b/arch/loongarch/boot/dts/loongson-2k0500-ref.dts @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +/dts-v1/; + +#include "loongson-2k0500.dtsi" + +/ { + compatible = "loongson,ls2k0500-ref", "loongson,ls2k0500"; + model = "Loongson-2K0500 Reference Board"; + + aliases { + ethernet0 = &gmac0; + ethernet1 = &gmac1; + serial0 = &uart0; + }; + + chosen { + stdout-path = "serial0:115200n8"; + }; + + memory@200000 { + device_type = "memory"; + reg = <0x0 0x00200000 0x0 0x0ee00000>, + <0x0 0x90000000 0x0 0x60000000>; + }; + + reserved-memory { + #address-cells = <2>; + #size-cells = <2>; + ranges; + + linux,cma { + compatible = "shared-dma-pool"; + reusable; + size = <0x0 0x2000000>; + linux,cma-default; + }; + }; +}; + +&gmac0 { + status = "okay"; + + phy-mode = "rgmii"; + bus_id = <0x0>; +}; + +&gmac1 { + status = "okay"; + + phy-mode = "rgmii"; + bus_id = <0x1>; +}; + +&i2c0 { + status = "okay"; + + #address-cells = <1>; + #size-cells = <0>; + eeprom@57{ + compatible = "atmel,24c16"; + reg = <0x57>; + pagesize = <16>; + }; +}; + +&ehci0 { + status = "okay"; +}; + +&ohci0 { + status = "okay"; +}; + +&sata { + status = "okay"; +}; + +&uart0 { + status = "okay"; +}; + +&rtc0 { + status = "okay"; +}; diff --git a/arch/loongarch/boot/dts/loongson-2k0500.dtsi b/arch/loongarch/boot/dts/loongson-2k0500.dtsi new file mode 100644 index 0000000000000..444779c21034b --- /dev/null +++ b/arch/loongarch/boot/dts/loongson-2k0500.dtsi @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +/dts-v1/; + +#include + +/ { + #address-cells = <2>; + #size-cells = <2>; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu0: cpu@0 { + compatible = "loongson,la264"; + device_type = "cpu"; + reg = <0x0>; + clocks = <&cpu_clk>; + }; + }; + + cpu_clk: cpu-clk { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <500000000>; + }; + + cpuintc: interrupt-controller { + compatible = "loongson,cpu-interrupt-controller"; + #interrupt-cells = <1>; + interrupt-controller; + }; + + bus@10000000 { + compatible = "simple-bus"; + ranges = <0x0 0x10000000 0x0 0x10000000 0x0 0x10000000>, + <0x0 0x02000000 0x0 0x02000000 0x0 0x02000000>, + <0x0 0x20000000 0x0 0x20000000 0x0 0x10000000>, + <0x0 0x40000000 0x0 0x40000000 0x0 0x40000000>, + <0xfe 0x0 0xfe 0x0 0x0 0x40000000>; + #address-cells = <2>; + #size-cells = <2>; + + isa@16400000 { + compatible = "isa"; + #size-cells = <1>; + #address-cells = <2>; + ranges = <1 0x0 0x0 0x16400000 0x4000>; + }; + + liointc0: interrupt-controller@1fe11400 { + compatible = "loongson,liointc-2.0"; + reg = <0x0 0x1fe11400 0x0 0x40>, + <0x0 0x1fe11040 0x0 0x8>; + reg-names = "main", "isr0"; + + interrupt-controller; + #interrupt-cells = <2>; + interrupt-parent = <&cpuintc>; + interrupts = <2>; + interrupt-names = "int0"; + + loongson,parent_int_map = <0xffffffff>, /* int0 */ + <0x00000000>, /* int1 */ + <0x00000000>, /* int2 */ + <0x00000000>; /* int3 */ + }; + + liointc1: interrupt-controller@1fe11440 { + compatible = "loongson,liointc-2.0"; + reg = <0x0 0x1fe11440 0x0 0x40>, + <0x0 0x1fe11048 0x0 0x8>; + reg-names = "main", "isr0"; + + interrupt-controller; + #interrupt-cells = <2>; + interrupt-parent = <&cpuintc>; + interrupts = <4>; + interrupt-names = "int2"; + + loongson,parent_int_map = <0x00000000>, /* int0 */ + <0x00000000>, /* int1 */ + <0xffffffff>, /* int2 */ + <0x00000000>; /* int3 */ + }; + + eiointc: interrupt-controller@1fe11600 { + compatible = "loongson,ls2k0500-eiointc"; + reg = <0x0 0x1fe11600 0x0 0xea00>; + interrupt-controller; + #interrupt-cells = <1>; + interrupt-parent = <&cpuintc>; + interrupts = <3>; + }; + + gmac0: ethernet@1f020000 { + compatible = "snps,dwmac-3.70a"; + reg = <0x0 0x1f020000 0x0 0x10000>; + interrupt-parent = <&liointc0>; + interrupts = <12 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq"; + status = "disabled"; + }; + + gmac1: ethernet@1f030000 { + compatible = "snps,dwmac-3.70a"; + reg = <0x0 0x1f030000 0x0 0x10000>; + interrupt-parent = <&liointc0>; + interrupts = <14 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq"; + status = "disabled"; + }; + + sata: sata@1f040000 { + compatible = "snps,spear-ahci"; + reg = <0x0 0x1f040000 0x0 0x10000>; + interrupt-parent = <&eiointc>; + interrupts = <75>; + status = "disabled"; + }; + + ehci0: usb@1f050000 { + compatible = "generic-ehci"; + reg = <0x0 0x1f050000 0x0 0x8000>; + interrupt-parent = <&eiointc>; + interrupts = <71>; + status = "disabled"; + }; + + ohci0: usb@1f058000 { + compatible = "generic-ohci"; + reg = <0x0 0x1f058000 0x0 0x8000>; + interrupt-parent = <&eiointc>; + interrupts = <72>; + status = "disabled"; + }; + + uart0: serial@1ff40800 { + compatible = "ns16550a"; + reg = <0x0 0x1ff40800 0x0 0x10>; + clock-frequency = <100000000>; + interrupt-parent = <&eiointc>; + interrupts = <2>; + no-loopback-test; + status = "disabled"; + }; + + i2c0: i2c@1ff48000 { + compatible = "loongson,ls2k-i2c"; + reg = <0x0 0x1ff48000 0x0 0x0800>; + interrupt-parent = <&eiointc>; + interrupts = <14>; + status = "disabled"; + }; + + i2c@1ff48800 { + compatible = "loongson,ls2k-i2c"; + reg = <0x0 0x1ff48800 0x0 0x0800>; + interrupt-parent = <&eiointc>; + interrupts = <15>; + status = "disabled"; + }; + + i2c@1ff49000 { + compatible = "loongson,ls2k-i2c"; + reg = <0x0 0x1ff49000 0x0 0x0800>; + interrupt-parent = <&eiointc>; + interrupts = <16>; + status = "disabled"; + }; + + i2c@1ff49800 { + compatible = "loongson,ls2k-i2c"; + reg = <0x0 0x1ff49800 0x0 0x0800>; + interrupt-parent = <&eiointc>; + interrupts = <17>; + status = "disabled"; + }; + + i2c@1ff4a000 { + compatible = "loongson,ls2k-i2c"; + reg = <0x0 0x1ff4a000 0x0 0x0800>; + interrupt-parent = <&eiointc>; + interrupts = <18>; + status = "disabled"; + }; + + i2c@1ff4a800 { + compatible = "loongson,ls2k-i2c"; + reg = <0x0 0x1ff4a800 0x0 0x0800>; + interrupt-parent = <&eiointc>; + interrupts = <19>; + status = "disabled"; + }; + + pmc: power-management@1ff6c000 { + compatible = "loongson,ls2k0500-pmc", "syscon"; + reg = <0x0 0x1ff6c000 0x0 0x58>; + interrupt-parent = <&eiointc>; + interrupts = <56>; + loongson,suspend-address = <0x0 0x1c000500>; + + syscon-reboot { + compatible = "syscon-reboot"; + offset = <0x30>; + mask = <0x1>; + }; + + syscon-poweroff { + compatible = "syscon-poweroff"; + regmap = <&pmc>; + offset = <0x14>; + mask = <0x3c00>; + value = <0x3c00>; + }; + }; + + rtc0: rtc@1ff6c100 { + compatible = "loongson,ls2k0500-rtc", "loongson,ls7a-rtc"; + reg = <0x0 0x1ff6c100 0x0 0x100>; + interrupt-parent = <&eiointc>; + interrupts = <35>; + status = "disabled"; + }; + + pcie@1a000000 { + compatible = "loongson,ls2k-pci"; + reg = <0x0 0x1a000000 0x0 0x02000000>, + <0xfe 0x0 0x0 0x20000000>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + bus-range = <0x0 0x5>; + ranges = <0x01000000 0x0 0x00004000 0x0 0x16404000 0x0 0x00004000>, + <0x02000000 0x0 0x40000000 0x0 0x40000000 0x0 0x40000000>; + + pcie@0,0 { + reg = <0x0000 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&eiointc>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &eiointc 81>; + ranges; + }; + + pcie@1,0 { + reg = <0x0800 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&eiointc>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &eiointc 82>; + ranges; + }; + }; + }; +}; -- GitLab From 30a5532a32066233a2e9a4751989276e91c82210 Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Wed, 17 Jan 2024 12:43:08 +0800 Subject: [PATCH 1020/1290] LoongArch: dts: DeviceTree for Loongson-2K1000 Add DeviceTree file for Loongson-2K1000 processor, which integrates two 64-bit 2-issue superscalar LA264 processor cores. Signed-off-by: Binbin Zhou Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/Makefile | 2 +- .../boot/dts/loongson-2k1000-ref.dts | 183 +++++++ arch/loongarch/boot/dts/loongson-2k1000.dtsi | 492 ++++++++++++++++++ 3 files changed, 676 insertions(+), 1 deletion(-) create mode 100644 arch/loongarch/boot/dts/loongson-2k1000-ref.dts create mode 100644 arch/loongarch/boot/dts/loongson-2k1000.dtsi diff --git a/arch/loongarch/boot/dts/Makefile b/arch/loongarch/boot/dts/Makefile index 89c9758bba7f1..cfb0a122d91c1 100644 --- a/arch/loongarch/boot/dts/Makefile +++ b/arch/loongarch/boot/dts/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -dtb-y = loongson-2k0500-ref.dtb +dtb-y = loongson-2k0500-ref.dtb loongson-2k1000-ref.dtb obj-$(CONFIG_BUILTIN_DTB) += $(addsuffix .dtb.o, $(CONFIG_BUILTIN_DTB_NAME)) diff --git a/arch/loongarch/boot/dts/loongson-2k1000-ref.dts b/arch/loongarch/boot/dts/loongson-2k1000-ref.dts new file mode 100644 index 0000000000000..132a2d1ea8bce --- /dev/null +++ b/arch/loongarch/boot/dts/loongson-2k1000-ref.dts @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +/dts-v1/; + +#include "loongson-2k1000.dtsi" + +/ { + compatible = "loongson,ls2k1000-ref", "loongson,ls2k1000"; + model = "Loongson-2K1000 Reference Board"; + + aliases { + serial0 = &uart0; + }; + + chosen { + stdout-path = "serial0:115200n8"; + }; + + memory@200000 { + device_type = "memory"; + reg = <0x0 0x00200000 0x0 0x06e00000>, + <0x0 0x08000000 0x0 0x07000000>, + <0x0 0x90000000 0x1 0xe0000000>; + }; + + reserved-memory { + #address-cells = <2>; + #size-cells = <2>; + ranges; + + linux,cma { + compatible = "shared-dma-pool"; + reusable; + size = <0x0 0x2000000>; + linux,cma-default; + }; + }; +}; + +&gmac0 { + status = "okay"; + + phy-mode = "rgmii"; + phy-handle = <&phy0>; + mdio { + compatible = "snps,dwmac-mdio"; + #address-cells = <1>; + #size-cells = <0>; + phy0: ethernet-phy@0 { + reg = <0>; + }; + }; +}; + +&gmac1 { + status = "okay"; + + phy-mode = "rgmii"; + phy-handle = <&phy1>; + mdio { + compatible = "snps,dwmac-mdio"; + #address-cells = <1>; + #size-cells = <0>; + phy1: ethernet-phy@1 { + reg = <16>; + }; + }; +}; + +&i2c2 { + status = "okay"; + + pinctrl-0 = <&i2c0_pins_default>; + pinctrl-names = "default"; + + #address-cells = <1>; + #size-cells = <0>; + eeprom@57{ + compatible = "atmel,24c16"; + reg = <0x57>; + pagesize = <16>; + }; +}; + +&spi0 { + status = "okay"; + + #address-cells = <1>; + #size-cells = <0>; + spidev@0 { + compatible = "rohm,dh2228fv"; + spi-max-frequency = <100000000>; + reg = <0>; + }; +}; + +&ehci0 { + status = "okay"; +}; + +&ohci0 { + status = "okay"; +}; + +&sata { + status = "okay"; +}; + +&uart0 { + status = "okay"; +}; + +&clk { + status = "okay"; +}; + +&rtc0 { + status = "okay"; +}; + +&pctrl { + status = "okay"; + + sdio_pins_default: sdio-pins { + sdio-pinmux { + groups = "sdio"; + function = "sdio"; + }; + sdio-det-pinmux { + groups = "pwm2"; + function = "gpio"; + }; + }; + + pwm1_pins_default: pwm1-pins { + pinmux { + groups = "pwm1"; + function = "pwm1"; + }; + }; + + pwm0_pins_default: pwm0-pins { + pinmux { + groups = "pwm0"; + function = "pwm0"; + }; + }; + + i2c1_pins_default: i2c1-pins { + pinmux { + groups = "i2c1"; + function = "i2c1"; + }; + }; + + i2c0_pins_default: i2c0-pins { + pinmux { + groups = "i2c0"; + function = "i2c0"; + }; + }; + + nand_pins_default: nand-pins { + pinmux { + groups = "nand"; + function = "nand"; + }; + }; + + hda_pins_default: hda-pins { + grp0-pinmux { + groups = "hda"; + function = "hda"; + }; + grp1-pinmux { + groups = "i2s"; + function = "gpio"; + }; + }; +}; diff --git a/arch/loongarch/boot/dts/loongson-2k1000.dtsi b/arch/loongarch/boot/dts/loongson-2k1000.dtsi new file mode 100644 index 0000000000000..49a70f8c3cab2 --- /dev/null +++ b/arch/loongarch/boot/dts/loongson-2k1000.dtsi @@ -0,0 +1,492 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +/dts-v1/; + +#include +#include +#include + +/ { + #address-cells = <2>; + #size-cells = <2>; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu0: cpu@0 { + compatible = "loongson,la264"; + device_type = "cpu"; + reg= <0x0>; + clocks = <&clk LOONGSON2_NODE_CLK>; + }; + + cpu1: cpu@1 { + compatible = "loongson,la264"; + device_type = "cpu"; + reg = <0x1>; + clocks = <&clk LOONGSON2_NODE_CLK>; + }; + }; + + ref_100m: clock-ref-100m { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <100000000>; + clock-output-names = "ref_100m"; + }; + + cpuintc: interrupt-controller { + compatible = "loongson,cpu-interrupt-controller"; + #interrupt-cells = <1>; + interrupt-controller; + }; + + /* i2c of the dvi eeprom edid */ + i2c-gpio-0 { + compatible = "i2c-gpio"; + scl-gpios = <&gpio0 0 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>; + sda-gpios = <&gpio0 1 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>; + i2c-gpio,delay-us = <5>; /* ~100 kHz */ + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + + /* i2c of the eeprom edid */ + i2c-gpio-1 { + compatible = "i2c-gpio"; + scl-gpios = <&gpio0 33 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>; + sda-gpios = <&gpio0 32 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>; + i2c-gpio,delay-us = <5>; /* ~100 kHz */ + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + + thermal-zones { + cpu-thermal { + polling-delay-passive = <1000>; + polling-delay = <5000>; + thermal-sensors = <&tsensor 0>; + + trips { + cpu_alert: cpu-alert { + temperature = <33000>; + hysteresis = <2000>; + type = "active"; + }; + + cpu_crit: cpu-crit { + temperature = <85000>; + hysteresis = <5000>; + type = "critical"; + }; + }; + }; + }; + + bus@10000000 { + compatible = "simple-bus"; + ranges = <0x0 0x10000000 0x0 0x10000000 0x0 0x10000000>, + <0x0 0x02000000 0x0 0x02000000 0x0 0x02000000>, + <0x0 0x20000000 0x0 0x20000000 0x0 0x10000000>, + <0x0 0x40000000 0x0 0x40000000 0x0 0x40000000>, + <0xfe 0x0 0xfe 0x0 0x0 0x40000000>; + #address-cells = <2>; + #size-cells = <2>; + dma-coherent; + + liointc0: interrupt-controller@1fe01400 { + compatible = "loongson,liointc-2.0"; + reg = <0x0 0x1fe01400 0x0 0x40>, + <0x0 0x1fe01040 0x0 0x8>, + <0x0 0x1fe01140 0x0 0x8>; + reg-names = "main", "isr0", "isr1"; + interrupt-controller; + #interrupt-cells = <2>; + interrupt-parent = <&cpuintc>; + interrupts = <2>; + interrupt-names = "int0"; + loongson,parent_int_map = <0xffffffff>, /* int0 */ + <0x00000000>, /* int1 */ + <0x00000000>, /* int2 */ + <0x00000000>; /* int3 */ + }; + + liointc1: interrupt-controller@1fe01440 { + compatible = "loongson,liointc-2.0"; + reg = <0x0 0x1fe01440 0x0 0x40>, + <0x0 0x1fe01048 0x0 0x8>, + <0x0 0x1fe01148 0x0 0x8>; + reg-names = "main", "isr0", "isr1"; + interrupt-controller; + #interrupt-cells = <2>; + interrupt-parent = <&cpuintc>; + interrupts = <3>; + interrupt-names = "int1"; + loongson,parent_int_map = <0x00000000>, /* int0 */ + <0xffffffff>, /* int1 */ + <0x00000000>, /* int2 */ + <0x00000000>; /* int3 */ + }; + + chipid@1fe00000 { + compatible = "loongson,ls2k-chipid"; + reg = <0x0 0x1fe00000 0x0 0x30>; + little-endian; + }; + + pctrl: pinctrl@1fe00420 { + compatible = "loongson,ls2k-pinctrl"; + reg = <0x0 0x1fe00420 0x0 0x18>; + status = "disabled"; + }; + + clk: clock-controller@1fe00480 { + compatible = "loongson,ls2k-clk"; + reg = <0x0 0x1fe00480 0x0 0x58>; + #clock-cells = <1>; + clocks = <&ref_100m>; + clock-names = "ref_100m"; + status = "disabled"; + }; + + gpio0: gpio@1fe00500 { + compatible = "loongson,ls2k-gpio"; + reg = <0x0 0x1fe00500 0x0 0x38>; + ngpios = <64>; + #gpio-cells = <2>; + gpio-controller; + gpio-ranges = <&pctrl 0x0 0x0 15>, + <&pctrl 16 16 15>, + <&pctrl 32 32 10>, + <&pctrl 44 44 20>; + interrupt-parent = <&liointc1>; + interrupts = <28 IRQ_TYPE_LEVEL_HIGH>, + <29 IRQ_TYPE_LEVEL_HIGH>, + <30 IRQ_TYPE_LEVEL_HIGH>, + <30 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <26 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <>, + <>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>, + <27 IRQ_TYPE_LEVEL_HIGH>; + }; + + tsensor: thermal-sensor@1fe01500 { + compatible = "loongson,ls2k1000-thermal"; + reg = <0x0 0x1fe01500 0x0 0x30>; + interrupt-parent = <&liointc0>; + interrupts = <7 IRQ_TYPE_LEVEL_HIGH>; + #thermal-sensor-cells = <1>; + }; + + dma-controller@1fe00c00 { + compatible = "loongson,ls2k1000-apbdma"; + reg = <0x0 0x1fe00c00 0x0 0x8>; + interrupt-parent = <&liointc1>; + interrupts = <12 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #dma-cells = <1>; + status = "disabled"; + }; + + dma-controller@1fe00c10 { + compatible = "loongson,ls2k1000-apbdma"; + reg = <0x0 0x1fe00c10 0x0 0x8>; + interrupt-parent = <&liointc1>; + interrupts = <13 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #dma-cells = <1>; + status = "disabled"; + }; + + dma-controller@1fe00c20 { + compatible = "loongson,ls2k1000-apbdma"; + reg = <0x0 0x1fe00c20 0x0 0x8>; + interrupt-parent = <&liointc1>; + interrupts = <14 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #dma-cells = <1>; + status = "disabled"; + }; + + dma-controller@1fe00c30 { + compatible = "loongson,ls2k1000-apbdma"; + reg = <0x0 0x1fe00c30 0x0 0x8>; + interrupt-parent = <&liointc1>; + interrupts = <15 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #dma-cells = <1>; + status = "disabled"; + }; + + dma-controller@1fe00c40 { + compatible = "loongson,ls2k1000-apbdma"; + reg = <0x0 0x1fe00c40 0x0 0x8>; + interrupt-parent = <&liointc1>; + interrupts = <16 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #dma-cells = <1>; + status = "disabled"; + }; + + uart0: serial@1fe20000 { + compatible = "ns16550a"; + reg = <0x0 0x1fe20000 0x0 0x10>; + clock-frequency = <125000000>; + interrupt-parent = <&liointc0>; + interrupts = <0x0 IRQ_TYPE_LEVEL_HIGH>; + no-loopback-test; + status = "disabled"; + }; + + i2c2: i2c@1fe21000 { + compatible = "loongson,ls2k-i2c"; + reg = <0x0 0x1fe21000 0x0 0x8>; + interrupt-parent = <&liointc0>; + interrupts = <22 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + i2c3: i2c@1fe21800 { + compatible = "loongson,ls2k-i2c"; + reg = <0x0 0x1fe21800 0x0 0x8>; + interrupt-parent = <&liointc0>; + interrupts = <23 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + pmc: power-management@1fe27000 { + compatible = "loongson,ls2k1000-pmc", "loongson,ls2k0500-pmc", "syscon"; + reg = <0x0 0x1fe27000 0x0 0x58>; + interrupt-parent = <&liointc1>; + interrupts = <11 IRQ_TYPE_LEVEL_HIGH>; + loongson,suspend-address = <0x0 0x1c000500>; + + syscon-reboot { + compatible = "syscon-reboot"; + offset = <0x30>; + mask = <0x1>; + }; + + syscon-poweroff { + compatible = "syscon-poweroff"; + regmap = <&pmc>; + offset = <0x14>; + mask = <0x3c00>; + value = <0x3c00>; + }; + }; + + rtc0: rtc@1fe27800 { + compatible = "loongson,ls2k1000-rtc"; + reg = <0x0 0x1fe27800 0x0 0x100>; + interrupt-parent = <&liointc1>; + interrupts = <8 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + spi0: spi@1fff0220 { + compatible = "loongson,ls2k1000-spi"; + reg = <0x0 0x1fff0220 0x0 0x10>; + clocks = <&clk LOONGSON2_BOOT_CLK>; + status = "disabled"; + }; + + pcie@1a000000 { + compatible = "loongson,ls2k-pci"; + reg = <0x0 0x1a000000 0x0 0x02000000>, + <0xfe 0x0 0x0 0x20000000>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + bus-range = <0x0 0xff>; + ranges = <0x01000000 0x0 0x00008000 0x0 0x18008000 0x0 0x00008000>, + <0x02000000 0x0 0x60000000 0x0 0x60000000 0x0 0x20000000>; + + gmac0: ethernet@3,0 { + reg = <0x1800 0x0 0x0 0x0 0x0>; + interrupt-parent = <&liointc0>; + interrupts = <12 IRQ_TYPE_LEVEL_HIGH>, + <13 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq", "eth_lpi"; + status = "disabled"; + }; + + gmac1: ethernet@3,1 { + reg = <0x1900 0x0 0x0 0x0 0x0>; + interrupt-parent = <&liointc0>; + interrupts = <14 IRQ_TYPE_LEVEL_HIGH>, + <15 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq", "eth_lpi"; + status = "disabled"; + }; + + ehci0: usb@4,1 { + reg = <0x2100 0x0 0x0 0x0 0x0>; + interrupt-parent = <&liointc1>; + interrupts = <18 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + ohci0: usb@4,2 { + reg = <0x2200 0x0 0x0 0x0 0x0>; + interrupt-parent = <&liointc1>; + interrupts = <19 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + display@6,0 { + reg = <0x3000 0x0 0x0 0x0 0x0>; + interrupt-parent = <&liointc0>; + interrupts = <28 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + hda@7,0 { + reg = <0x3800 0x0 0x0 0x0 0x0>; + interrupt-parent = <&liointc0>; + interrupts = <4 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + sata: sata@8,0 { + reg = <0x4000 0x0 0x0 0x0 0x0>; + interrupt-parent = <&liointc0>; + interrupts = <19 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + pcie@9,0 { + reg = <0x4800 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &liointc1 0x0 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + + pcie@a,0 { + reg = <0x5000 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&liointc1>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &liointc1 1 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + + pcie@b,0 { + reg = <0x5800 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&liointc1>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &liointc1 2 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + + pcie@c,0 { + reg = <0x6000 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&liointc1>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &liointc1 3 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + + pcie@d,0 { + reg = <0x6800 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&liointc1>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &liointc1 4 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + + pcie@e,0 { + reg = <0x7000 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&liointc1>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &liointc1 5 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + }; + }; +}; -- GitLab From 2905844f682808275ea5daf6f5b668fbfe547363 Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Wed, 17 Jan 2024 12:43:08 +0800 Subject: [PATCH 1021/1290] LoongArch: dts: DeviceTree for Loongson-2K2000 Add DeviceTree file for Loongson-2K2000 processor, which integrates two 64-bit 3-issue superscalar LA364 processor cores. Signed-off-by: Binbin Zhou Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/Makefile | 2 +- .../boot/dts/loongson-2k2000-ref.dts | 72 +++++ arch/loongarch/boot/dts/loongson-2k2000.dtsi | 300 ++++++++++++++++++ 3 files changed, 373 insertions(+), 1 deletion(-) create mode 100644 arch/loongarch/boot/dts/loongson-2k2000-ref.dts create mode 100644 arch/loongarch/boot/dts/loongson-2k2000.dtsi diff --git a/arch/loongarch/boot/dts/Makefile b/arch/loongarch/boot/dts/Makefile index cfb0a122d91c1..747d0c3f63892 100644 --- a/arch/loongarch/boot/dts/Makefile +++ b/arch/loongarch/boot/dts/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -dtb-y = loongson-2k0500-ref.dtb loongson-2k1000-ref.dtb +dtb-y = loongson-2k0500-ref.dtb loongson-2k1000-ref.dtb loongson-2k2000-ref.dtb obj-$(CONFIG_BUILTIN_DTB) += $(addsuffix .dtb.o, $(CONFIG_BUILTIN_DTB_NAME)) diff --git a/arch/loongarch/boot/dts/loongson-2k2000-ref.dts b/arch/loongarch/boot/dts/loongson-2k2000-ref.dts new file mode 100644 index 0000000000000..dca91caf895e3 --- /dev/null +++ b/arch/loongarch/boot/dts/loongson-2k2000-ref.dts @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +/dts-v1/; + +#include "loongson-2k2000.dtsi" + +/ { + compatible = "loongson,ls2k2000-ref", "loongson,ls2k2000"; + model = "Loongson-2K2000 Reference Board"; + + aliases { + serial0 = &uart0; + }; + + chosen { + stdout-path = "serial0:115200n8"; + }; + + memory@200000 { + device_type = "memory"; + reg = <0x0 0x00200000 0x0 0x0ee00000>, + <0x0 0x90000000 0x0 0x70000000>; + }; + + reserved-memory { + #address-cells = <2>; + #size-cells = <2>; + ranges; + + linux,cma { + compatible = "shared-dma-pool"; + reusable; + size = <0x0 0x2000000>; + linux,cma-default; + }; + }; +}; + +&sata { + status = "okay"; +}; + +&uart0 { + status = "okay"; +}; + +&rtc0 { + status = "okay"; +}; + +&xhci0 { + status = "okay"; +}; + +&xhci1 { + status = "okay"; +}; + +&gmac0 { + status = "okay"; +}; + +&gmac1 { + status = "okay"; +}; + +&gmac2 { + status = "okay"; +}; diff --git a/arch/loongarch/boot/dts/loongson-2k2000.dtsi b/arch/loongarch/boot/dts/loongson-2k2000.dtsi new file mode 100644 index 0000000000000..a231949b5f553 --- /dev/null +++ b/arch/loongarch/boot/dts/loongson-2k2000.dtsi @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +/dts-v1/; + +#include + +/ { + #address-cells = <2>; + #size-cells = <2>; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu0: cpu@1 { + compatible = "loongson,la364"; + device_type = "cpu"; + reg = <0x0>; + clocks = <&cpu_clk>; + }; + + cpu1: cpu@2 { + compatible = "loongson,la364"; + device_type = "cpu"; + reg = <0x1>; + clocks = <&cpu_clk>; + }; + }; + + cpu_clk: cpu-clk { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <1400000000>; + }; + + cpuintc: interrupt-controller { + compatible = "loongson,cpu-interrupt-controller"; + #interrupt-cells = <1>; + interrupt-controller; + }; + + bus@10000000 { + compatible = "simple-bus"; + ranges = <0x0 0x10000000 0x0 0x10000000 0x0 0x10000000>, + <0x0 0x02000000 0x0 0x02000000 0x0 0x02000000>, + <0x0 0x40000000 0x0 0x40000000 0x0 0x40000000>, + <0xfe 0x0 0xfe 0x0 0x0 0x40000000>; + #address-cells = <2>; + #size-cells = <2>; + + pmc: power-management@100d0000 { + compatible = "loongson,ls2k2000-pmc", "loongson,ls2k0500-pmc", "syscon"; + reg = <0x0 0x100d0000 0x0 0x58>; + interrupt-parent = <&eiointc>; + interrupts = <47>; + loongson,suspend-address = <0x0 0x1c000500>; + + syscon-reboot { + compatible = "syscon-reboot"; + offset = <0x30>; + mask = <0x1>; + }; + + syscon-poweroff { + compatible = "syscon-poweroff"; + regmap = <&pmc>; + offset = <0x14>; + mask = <0x3c00>; + value = <0x3c00>; + }; + }; + + liointc: interrupt-controller@1fe01400 { + compatible = "loongson,liointc-1.0"; + reg = <0x0 0x1fe01400 0x0 0x64>; + + interrupt-controller; + #interrupt-cells = <2>; + interrupt-parent = <&cpuintc>; + interrupts = <2>; + interrupt-names = "int0"; + loongson,parent_int_map = <0xffffffff>, /* int0 */ + <0x00000000>, /* int1 */ + <0x00000000>, /* int2 */ + <0x00000000>; /* int3 */ + }; + + eiointc: interrupt-controller@1fe01600 { + compatible = "loongson,ls2k2000-eiointc"; + reg = <0x0 0x1fe01600 0x0 0xea00>; + interrupt-controller; + #interrupt-cells = <1>; + interrupt-parent = <&cpuintc>; + interrupts = <3>; + }; + + pic: interrupt-controller@10000000 { + compatible = "loongson,pch-pic-1.0"; + reg = <0x0 0x10000000 0x0 0x400>; + interrupt-controller; + #interrupt-cells = <2>; + loongson,pic-base-vec = <0>; + interrupt-parent = <&eiointc>; + }; + + msi: msi-controller@1fe01140 { + compatible = "loongson,pch-msi-1.0"; + reg = <0x0 0x1fe01140 0x0 0x8>; + msi-controller; + loongson,msi-base-vec = <64>; + loongson,msi-num-vecs = <192>; + interrupt-parent = <&eiointc>; + }; + + rtc0: rtc@100d0100 { + compatible = "loongson,ls2k2000-rtc", "loongson,ls7a-rtc"; + reg = <0x0 0x100d0100 0x0 0x100>; + interrupt-parent = <&pic>; + interrupts = <52 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + uart0: serial@1fe001e0 { + compatible = "ns16550a"; + reg = <0x0 0x1fe001e0 0x0 0x10>; + clock-frequency = <100000000>; + interrupt-parent = <&liointc>; + interrupts = <10 IRQ_TYPE_LEVEL_HIGH>; + no-loopback-test; + status = "disabled"; + }; + + pcie@1a000000 { + compatible = "loongson,ls2k-pci"; + reg = <0x0 0x1a000000 0x0 0x02000000>, + <0xfe 0x0 0x0 0x20000000>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + bus-range = <0x0 0xff>; + ranges = <0x01000000 0x0 0x00008000 0x0 0x18400000 0x0 0x00008000>, + <0x02000000 0x0 0x60000000 0x0 0x60000000 0x0 0x20000000>; + + gmac0: ethernet@3,0 { + reg = <0x1800 0x0 0x0 0x0 0x0>; + interrupts = <12 IRQ_TYPE_LEVEL_HIGH>; + interrupt-parent = <&pic>; + status = "disabled"; + }; + + gmac1: ethernet@3,1 { + reg = <0x1900 0x0 0x0 0x0 0x0>; + interrupts = <14 IRQ_TYPE_LEVEL_HIGH>; + interrupt-parent = <&pic>; + status = "disabled"; + }; + + gmac2: ethernet@3,2 { + reg = <0x1a00 0x0 0x0 0x0 0x0>; + interrupts = <17 IRQ_TYPE_LEVEL_HIGH>; + interrupt-parent = <&pic>; + status = "disabled"; + }; + + xhci0: usb@4,0 { + reg = <0x2000 0x0 0x0 0x0 0x0>; + interrupts = <48 IRQ_TYPE_LEVEL_HIGH>; + interrupt-parent = <&pic>; + status = "disabled"; + }; + + xhci1: usb@19,0 { + reg = <0xc800 0x0 0x0 0x0 0x0>; + interrupts = <22 IRQ_TYPE_LEVEL_HIGH>; + interrupt-parent = <&pic>; + status = "disabled"; + }; + + display@6,1 { + reg = <0x3100 0x0 0x0 0x0 0x0>; + interrupts = <28 IRQ_TYPE_LEVEL_HIGH>; + interrupt-parent = <&pic>; + status = "disabled"; + }; + + hda@7,0 { + reg = <0x3800 0x0 0x0 0x0 0x0>; + interrupts = <58 IRQ_TYPE_LEVEL_HIGH>; + interrupt-parent = <&pic>; + status = "disabled"; + }; + + sata: sata@8,0 { + reg = <0x4000 0x0 0x0 0x0 0x0>; + interrupts = <16 IRQ_TYPE_LEVEL_HIGH>; + interrupt-parent = <&pic>; + status = "disabled"; + }; + + pcie@9,0 { + reg = <0x4800 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&pic>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &pic 32 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + + pcie@a,0 { + reg = <0x5000 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&pic>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &pic 33 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + + pcie@b,0 { + reg = <0x5800 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&pic>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &pic 34 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + + pcie@c,0 { + reg = <0x6000 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&pic>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &pic 35 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + + pcie@d,0 { + reg = <0x6800 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&pic>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &pic 36 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + + pcie@e,0 { + reg = <0x7000 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&pic>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &pic 37 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + + pcie@f,0 { + reg = <0x7800 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&pic>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &pic 40 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + + pcie@10,0 { + reg = <0x8000 0x0 0x0 0x0 0x0>; + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + interrupt-parent = <&pic>; + #interrupt-cells = <1>; + interrupt-map-mask = <0x0 0x0 0x0 0x0>; + interrupt-map = <0x0 0x0 0x0 0x0 &pic 30 IRQ_TYPE_LEVEL_HIGH>; + ranges; + }; + }; + }; +}; -- GitLab From 44a01f1f726ab1d2050fb741eca4fabfa3cab799 Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Wed, 17 Jan 2024 12:43:08 +0800 Subject: [PATCH 1022/1290] LoongArch: Parsing CPU-related information from DTS Generally, we can get cpu-related information, such as model name, from /proc/cpuinfo. For FDT-based systems, we need to parse the relevant information from DTS. BTW, set loongson_sysconf.cores_per_package to num_processors if SMBIOS doesn't provide a valid number (usually FDT-based systems). Signed-off-by: Binbin Zhou Signed-off-by: Hongliang Wang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/env.c | 34 +++++++++++++++++++++++++++++++++- arch/loongarch/kernel/smp.c | 3 +++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/env.c b/arch/loongarch/kernel/env.c index 6b3bfb0092e60..2f1f5b08638f8 100644 --- a/arch/loongarch/kernel/env.c +++ b/arch/loongarch/kernel/env.c @@ -5,13 +5,16 @@ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ #include +#include #include #include #include +#include #include #include #include #include +#include u64 efi_system_table; struct loongson_system_configuration loongson_sysconf; @@ -36,7 +39,16 @@ void __init init_environ(void) static int __init init_cpu_fullname(void) { - int cpu; + struct device_node *root; + int cpu, ret; + char *model; + + /* Parsing cpuname from DTS model property */ + root = of_find_node_by_path("/"); + ret = of_property_read_string(root, "model", (const char **)&model); + of_node_put(root); + if (ret == 0) + loongson_sysconf.cpuname = strsep(&model, " "); if (loongson_sysconf.cpuname && !strncmp(loongson_sysconf.cpuname, "Loongson", 8)) { for (cpu = 0; cpu < NR_CPUS; cpu++) @@ -46,6 +58,26 @@ static int __init init_cpu_fullname(void) } arch_initcall(init_cpu_fullname); +static int __init fdt_cpu_clk_init(void) +{ + struct clk *clk; + struct device_node *np; + + np = of_get_cpu_node(0, NULL); + if (!np) + return -ENODEV; + + clk = of_clk_get(np, 0); + if (IS_ERR(clk)) + return -ENODEV; + + cpu_clock_freq = clk_get_rate(clk); + clk_put(clk); + + return 0; +} +late_initcall(fdt_cpu_clk_init); + static ssize_t boardinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index 5bca12d16e069..9e33b5e361225 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -216,6 +216,9 @@ void __init loongson_smp_setup(void) { fdt_smp_setup(); + if (loongson_sysconf.cores_per_package == 0) + loongson_sysconf.cores_per_package = num_processors; + cpu_data[0].core = cpu_logical_map(0) % loongson_sysconf.cores_per_package; cpu_data[0].package = cpu_logical_map(0) / loongson_sysconf.cores_per_package; -- GitLab From 9499daeade0edcbd3d5d20ffdd642a8e3a194b1f Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 17 Jan 2024 12:43:08 +0800 Subject: [PATCH 1023/1290] LoongArch: Add a missing call to efi_esrt_init() ESRT (EFI System Resource Table) is needed for UEFI's "Capsule Update" feature. But ESRT initialization is missing on LoongArch now, so add a call to efi_esrt_init() at the end of efi_init(). Signed-off-by: Huacai Chen --- arch/loongarch/kernel/efi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/loongarch/kernel/efi.c b/arch/loongarch/kernel/efi.c index acb5d3385675c..000825406c1f6 100644 --- a/arch/loongarch/kernel/efi.c +++ b/arch/loongarch/kernel/efi.c @@ -140,4 +140,6 @@ void __init efi_init(void) early_memunmap(tbl, sizeof(*tbl)); } + + efi_esrt_init(); } -- GitLab From d23b77953f5a4fbf94c05157b186aac2a247ae32 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 17 Jan 2024 12:43:08 +0800 Subject: [PATCH 1024/1290] LoongArch: Change SHMLBA from SZ_64K to PAGE_SIZE LoongArch has hardware page coloring for L1 Cache, so we don't have cache aliases. But SFB (Store Fill Buffer) still has aliases. So we define SHMLBA to SZ_64K previously. But there are losts of applications use PAGE_SIZE rather than SHMLBA to mmap() file pages and shared pages. Of course we can fix them one by one, but not easy. On the other hand, we can simply disable SFB for 4KB page size to fix cache alias (there will be performance decrease, but acceptable), and in future we will fix SFB in hardware. So we can safely define SHMLBA to PAGE_SIZE (use the generic shmparam.h) to make life easier. Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/shmparam.h | 12 ------------ arch/loongarch/kernel/head.S | 10 ++++++++++ 2 files changed, 10 insertions(+), 12 deletions(-) delete mode 100644 arch/loongarch/include/asm/shmparam.h diff --git a/arch/loongarch/include/asm/shmparam.h b/arch/loongarch/include/asm/shmparam.h deleted file mode 100644 index c9554f48d2dfa..0000000000000 --- a/arch/loongarch/include/asm/shmparam.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2020-2022 Loongson Technology Corporation Limited - */ -#ifndef _ASM_SHMPARAM_H -#define _ASM_SHMPARAM_H - -#define __ARCH_FORCE_SHMLBA 1 - -#define SHMLBA SZ_64K /* attach addr a multiple of this */ - -#endif /* _ASM_SHMPARAM_H */ diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S index 53b883db07862..be187e99d3587 100644 --- a/arch/loongarch/kernel/head.S +++ b/arch/loongarch/kernel/head.S @@ -75,6 +75,11 @@ SYM_CODE_START(kernel_entry) # kernel entry point la.pcrel t0, fw_arg2 st.d a2, t0, 0 +#ifdef CONFIG_PAGE_SIZE_4KB + li.d t0, 0 + li.d t1, CSR_STFILL + csrxchg t0, t1, LOONGARCH_CSR_IMPCTL1 +#endif /* KSave3 used for percpu base, initialized as 0 */ csrwr zero, PERCPU_BASE_KS /* GPR21 used for percpu base (runtime), initialized as 0 */ @@ -127,6 +132,11 @@ SYM_CODE_START(smpboot_entry) JUMP_VIRT_ADDR t0, t1 +#ifdef CONFIG_PAGE_SIZE_4KB + li.d t0, 0 + li.d t1, CSR_STFILL + csrxchg t0, t1, LOONGARCH_CSR_IMPCTL1 +#endif /* Enable PG */ li.w t0, 0xb0 # PLV=0, IE=0, PG=1 csrwr t0, LOONGARCH_CSR_CRMD -- GitLab From ce68ff3528e6eff4a1a4770600ec6c66779ba7b9 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 17 Jan 2024 12:43:08 +0800 Subject: [PATCH 1025/1290] LoongArch: Let cores_io_master cover the largest NR_CPUS Now loongson_system_configuration::cores_io_master only covers 64 cpus, if NR_CPUS > 64 there will be memory corruption. So let cores_io_master cover the largest NR_CPUS (256). Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/bootinfo.h | 6 ++++-- arch/loongarch/kernel/acpi.c | 2 +- arch/loongarch/kernel/smp.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/bootinfo.h b/arch/loongarch/include/asm/bootinfo.h index c60796869b2b8..6d5846dd075cb 100644 --- a/arch/loongarch/include/asm/bootinfo.h +++ b/arch/loongarch/include/asm/bootinfo.h @@ -24,13 +24,15 @@ struct loongson_board_info { const char *board_vendor; }; +#define NR_WORDS DIV_ROUND_UP(NR_CPUS, BITS_PER_LONG) + struct loongson_system_configuration { int nr_cpus; int nr_nodes; int boot_cpu_id; int cores_per_node; int cores_per_package; - unsigned long cores_io_master; + unsigned long cores_io_master[NR_WORDS]; unsigned long suspend_addr; const char *cpuname; }; @@ -42,7 +44,7 @@ extern struct loongson_system_configuration loongson_sysconf; static inline bool io_master(int cpu) { - return test_bit(cpu, &loongson_sysconf.cores_io_master); + return test_bit(cpu, loongson_sysconf.cores_io_master); } #endif /* _ASM_BOOTINFO_H */ diff --git a/arch/loongarch/kernel/acpi.c b/arch/loongarch/kernel/acpi.c index 8e00a754e5489..b6b097bbf8668 100644 --- a/arch/loongarch/kernel/acpi.c +++ b/arch/loongarch/kernel/acpi.c @@ -119,7 +119,7 @@ acpi_parse_eio_master(union acpi_subtable_headers *header, const unsigned long e return -EINVAL; core = eiointc->node * CORES_PER_EIO_NODE; - set_bit(core, &(loongson_sysconf.cores_io_master)); + set_bit(core, loongson_sysconf.cores_io_master); return 0; } diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index 9e33b5e361225..a16e3dbe9f09e 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -208,7 +208,7 @@ static void __init fdt_smp_setup(void) } loongson_sysconf.nr_cpus = num_processors; - set_bit(0, &(loongson_sysconf.cores_io_master)); + set_bit(0, loongson_sysconf.cores_io_master); #endif } -- GitLab From c2396651309eba291c15e32db8fbe44c738b5921 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Wed, 17 Jan 2024 12:43:08 +0800 Subject: [PATCH 1026/1290] LoongArch: Fix and simplify fcsr initialization on execve() There has been a lingering bug in LoongArch Linux systems causing some GCC tests to intermittently fail (see Closes link). I've made a minimal reproducer: zsh% cat measure.s .align 4 .globl _start _start: movfcsr2gr $a0, $fcsr0 bstrpick.w $a0, $a0, 16, 16 beqz $a0, .ok break 0 .ok: li.w $a7, 93 syscall 0 zsh% cc mesaure.s -o measure -nostdlib zsh% echo $((1.0/3)) 0.33333333333333331 zsh% while ./measure; do ; done This while loop should not stop as POSIX is clear that execve must set fenv to the default, where FCSR should be zero. But in fact it will just stop after running for a while (normally less than 30 seconds). Note that "$((1.0/3))" is needed to reproduce this issue because it raises FE_INVALID and makes fcsr0 non-zero. The problem is we are currently relying on SET_PERSONALITY2() to reset current->thread.fpu.fcsr. But SET_PERSONALITY2() is executed before start_thread which calls lose_fpu(0). We can see if kernel preempt is enabled, we may switch to another thread after SET_PERSONALITY2() but before lose_fpu(0). Then bad thing happens: during the thread switch the value of the fcsr0 register is stored into current->thread.fpu.fcsr, making it dirty again. The issue can be fixed by setting current->thread.fpu.fcsr after lose_fpu(0) because lose_fpu() clears TIF_USEDFPU, then the thread switch won't touch current->thread.fpu.fcsr. The only other architecture setting FCSR in SET_PERSONALITY2() is MIPS. I've ran a similar test on MIPS with mainline kernel and it turns out MIPS is buggy, too. Anyway MIPS do this for supporting different FP flavors (NaN encodings, etc.) which do not exist on LoongArch. So for LoongArch, we can simply remove the current->thread.fpu.fcsr setting from SET_PERSONALITY2() and do it in start_thread(), after lose_fpu(0). The while loop failing with the mainline kernel has survived one hour after this change on LoongArch. Fixes: 803b0fc5c3f2baa ("LoongArch: Add process management") Closes: https://github.com/loongson-community/discussions/issues/7 Link: https://lore.kernel.org/linux-mips/7a6aa1bbdbbe2e63ae96ff163fab0349f58f1b9e.camel@xry111.site/ Cc: stable@vger.kernel.org Signed-off-by: Xi Ruoyao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/elf.h | 5 ----- arch/loongarch/kernel/elf.c | 5 ----- arch/loongarch/kernel/process.c | 1 + 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/loongarch/include/asm/elf.h b/arch/loongarch/include/asm/elf.h index 9b16a3b8e7060..f16bd42456e4c 100644 --- a/arch/loongarch/include/asm/elf.h +++ b/arch/loongarch/include/asm/elf.h @@ -241,8 +241,6 @@ void loongarch_dump_regs64(u64 *uregs, const struct pt_regs *regs); do { \ current->thread.vdso = &vdso_info; \ \ - loongarch_set_personality_fcsr(state); \ - \ if (personality(current->personality) != PER_LINUX) \ set_personality(PER_LINUX); \ } while (0) @@ -259,7 +257,6 @@ do { \ clear_thread_flag(TIF_32BIT_ADDR); \ \ current->thread.vdso = &vdso_info; \ - loongarch_set_personality_fcsr(state); \ \ p = personality(current->personality); \ if (p != PER_LINUX32 && p != PER_LINUX) \ @@ -340,6 +337,4 @@ extern int arch_elf_pt_proc(void *ehdr, void *phdr, struct file *elf, extern int arch_check_elf(void *ehdr, bool has_interpreter, void *interp_ehdr, struct arch_elf_state *state); -extern void loongarch_set_personality_fcsr(struct arch_elf_state *state); - #endif /* _ASM_ELF_H */ diff --git a/arch/loongarch/kernel/elf.c b/arch/loongarch/kernel/elf.c index 183e94fc9c69c..0fa81ced28dcd 100644 --- a/arch/loongarch/kernel/elf.c +++ b/arch/loongarch/kernel/elf.c @@ -23,8 +23,3 @@ int arch_check_elf(void *_ehdr, bool has_interpreter, void *_interp_ehdr, { return 0; } - -void loongarch_set_personality_fcsr(struct arch_elf_state *state) -{ - current->thread.fpu.fcsr = boot_cpu_data.fpu_csr0; -} diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c index 767d94cce0de0..f2ff8b5d591e4 100644 --- a/arch/loongarch/kernel/process.c +++ b/arch/loongarch/kernel/process.c @@ -85,6 +85,7 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) regs->csr_euen = euen; lose_fpu(0); lose_lbt(0); + current->thread.fpu.fcsr = boot_cpu_data.fpu_csr0; clear_thread_flag(TIF_LSX_CTX_LIVE); clear_thread_flag(TIF_LASX_CTX_LIVE); -- GitLab From 78de91b45860da175bab73f4521d9ad875f3a7d4 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Wed, 17 Jan 2024 12:43:08 +0800 Subject: [PATCH 1027/1290] LoongArch: Use generic interface to support crashkernel=X,[high,low] LoongArch already supports two crashkernel regions in kexec-tools, so we can directly use the common interface to support crashkernel=X,[high,low] after commit 0ab97169aa0517079b ("crash_core: add generic function to do reservation"). With the help of newly changed function parse_crashkernel() and generic reserve_crashkernel_generic(), crashkernel reservation can be simplified by steps: 1) Add a new header file , then define CRASH_ALIGN, CRASH_ADDR_LOW_MAX and CRASH_ADDR_HIGH_MAX and in ; 2) Add arch_reserve_crashkernel() to call parse_crashkernel() and reserve_crashkernel_generic(); 3) Add ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION Kconfig in arch/loongarch/Kconfig. One can reserve the crash kernel from high memory above DMA zone range by explicitly passing "crashkernel=X,high"; or reserve a memory range below 4G with "crashkernel=X,low". Besides, there are few rules need to take notice: 1) "crashkernel=X,[high,low]" will be ignored if "crashkernel=size" is specified. 2) "crashkernel=X,low" is valid only when "crashkernel=X,high" is passed and there is enough memory to be allocated under 4G. 3) When allocating crashkernel above 4G and no "crashkernel=X,low" is specified, a 128M low memory will be allocated automatically for swiotlb bounce buffer. See Documentation/admin-guide/kernel-parameters.txt for more information. Following test cases have been performed as expected: 1) crashkernel=256M //low=256M 2) crashkernel=1G //low=1G 3) crashkernel=4G //high=4G, low=128M(default) 4) crashkernel=4G crashkernel=256M,high //high=4G, low=128M(default), high is ignored 5) crashkernel=4G crashkernel=256M,low //high=4G, low=128M(default), low is ignored 6) crashkernel=4G,high //high=4G, low=128M(default) 7) crashkernel=256M,low //low=0M, invalid 8) crashkernel=4G,high crashkernel=256M,low //high=4G, low=256M 9) crashkernel=4G,high crashkernel=4G,low //high=0M, low=0M, invalid 10) crashkernel=512M@2560M //low=512M 11) crashkernel=1G,high crashkernel=0M,low //high=1G, low=0M Recommended usage in general: 1) In the case of small memory: crashkernel=512M 2) In the case of large memory: crashkernel=1024M,high crashkernel=128M,low Signed-off-by: Youling Tang Signed-off-by: Huacai Chen --- .../admin-guide/kernel-parameters.txt | 24 +++++----- arch/loongarch/Kconfig | 3 ++ arch/loongarch/include/asm/crash_core.h | 12 +++++ arch/loongarch/kernel/setup.c | 44 +++++-------------- 4 files changed, 38 insertions(+), 45 deletions(-) create mode 100644 arch/loongarch/include/asm/crash_core.h diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 65731b060e3fe..f2633dd87a974 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -877,9 +877,9 @@ memory region [offset, offset + size] for that kernel image. If '@offset' is omitted, then a suitable offset is selected automatically. - [KNL, X86-64, ARM64, RISCV] Select a region under 4G first, and - fall back to reserve region above 4G when '@offset' - hasn't been specified. + [KNL, X86-64, ARM64, RISCV, LoongArch] Select a region + under 4G first, and fall back to reserve region above + 4G when '@offset' hasn't been specified. See Documentation/admin-guide/kdump/kdump.rst for further details. crashkernel=range1:size1[,range2:size2,...][@offset] @@ -890,25 +890,27 @@ Documentation/admin-guide/kdump/kdump.rst for an example. crashkernel=size[KMG],high - [KNL, X86-64, ARM64, RISCV] range could be above 4G. + [KNL, X86-64, ARM64, RISCV, LoongArch] range could be + above 4G. Allow kernel to allocate physical memory region from top, so could be above 4G if system have more than 4G ram installed. Otherwise memory region will be allocated below 4G, if available. It will be ignored if crashkernel=X is specified. crashkernel=size[KMG],low - [KNL, X86-64, ARM64, RISCV] range under 4G. When crashkernel=X,high - is passed, kernel could allocate physical memory region - above 4G, that cause second kernel crash on system - that require some amount of low memory, e.g. swiotlb - requires at least 64M+32K low memory, also enough extra - low memory is needed to make sure DMA buffers for 32-bit - devices won't run out. Kernel would try to allocate + [KNL, X86-64, ARM64, RISCV, LoongArch] range under 4G. + When crashkernel=X,high is passed, kernel could allocate + physical memory region above 4G, that cause second kernel + crash on system that require some amount of low memory, + e.g. swiotlb requires at least 64M+32K low memory, also + enough extra low memory is needed to make sure DMA buffers + for 32-bit devices won't run out. Kernel would try to allocate default size of memory below 4G automatically. The default size is platform dependent. --> x86: max(swiotlb_size_or_default() + 8MiB, 256MiB) --> arm64: 128MiB --> riscv: 128MiB + --> loongarch: 128MiB This one lets the user specify own low range under 4G for second kernel instead. 0: to disable low allocation. diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index ede2ef26726a8..c997223beae00 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -594,6 +594,9 @@ config ARCH_SELECTS_CRASH_DUMP depends on CRASH_DUMP select RELOCATABLE +config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + def_bool CRASH_CORE + config RELOCATABLE bool "Relocatable kernel" help diff --git a/arch/loongarch/include/asm/crash_core.h b/arch/loongarch/include/asm/crash_core.h new file mode 100644 index 0000000000000..218bdbfa527ba --- /dev/null +++ b/arch/loongarch/include/asm/crash_core.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LOONGARCH_CRASH_CORE_H +#define _LOONGARCH_CRASH_CORE_H + +#define CRASH_ALIGN SZ_2M + +#define CRASH_ADDR_LOW_MAX SZ_4G +#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() + +extern phys_addr_t memblock_end_of_DRAM(void); + +#endif diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 15d366b8407c3..edf2bba801306 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -252,38 +252,23 @@ static void __init arch_reserve_vmcore(void) #endif } -/* 2MB alignment for crash kernel regions */ -#define CRASH_ALIGN SZ_2M -#define CRASH_ADDR_MAX SZ_4G - -static void __init arch_parse_crashkernel(void) +static void __init arch_reserve_crashkernel(void) { -#ifdef CONFIG_KEXEC int ret; - unsigned long long total_mem; + unsigned long long low_size = 0; unsigned long long crash_base, crash_size; + char *cmdline = boot_command_line; + bool high = false; - total_mem = memblock_phys_mem_size(); - ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base, - NULL, NULL); - if (ret < 0 || crash_size <= 0) + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) return; - if (crash_base <= 0) { - crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, CRASH_ALIGN, CRASH_ADDR_MAX); - if (!crash_base) { - pr_warn("crashkernel reservation failed - No suitable area found.\n"); - return; - } - } else if (!memblock_phys_alloc_range(crash_size, CRASH_ALIGN, crash_base, crash_base + crash_size)) { - pr_warn("Invalid memory region reserved for crash kernel\n"); + ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), + &crash_size, &crash_base, &low_size, &high); + if (ret) return; - } - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; -#endif + reserve_crashkernel_generic(cmdline, crash_size, crash_base, low_size, high); } static void __init fdt_setup(void) @@ -363,7 +348,7 @@ out: void __init platform_init(void) { arch_reserve_vmcore(); - arch_parse_crashkernel(); + arch_reserve_crashkernel(); #ifdef CONFIG_ACPI_TABLE_UPGRADE acpi_table_upgrade(); @@ -473,15 +458,6 @@ static void __init resource_init(void) request_resource(res, &data_resource); request_resource(res, &bss_resource); } - -#ifdef CONFIG_KEXEC - if (crashk_res.start < crashk_res.end) { - insert_resource(&iomem_resource, &crashk_res); - pr_info("Reserving %ldMB of memory at %ldMB for crashkernel\n", - (unsigned long)((crashk_res.end - crashk_res.start + 1) >> 20), - (unsigned long)(crashk_res.start >> 20)); - } -#endif } static int __init add_legacy_isa_io(struct fwnode_handle *fwnode, -- GitLab From 91af17cd7d03db8836554c91ba7c38b0817aa980 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 17 Jan 2024 12:43:08 +0800 Subject: [PATCH 1028/1290] LoongArch: Fix definition of ftrace_regs_set_instruction_pointer() The current definition of ftrace_regs_set_instruction_pointer() is not correct. Obviously, this function is used to set instruction pointer but not return value, so it should call instruction_pointer_set() instead of regs_set_return_value(). There is no side effect by now because it is only used for kernel live- patching which is not supported, so fix it to avoid failure when testing livepatch in the future. Fixes: 6fbff14a6382 ("LoongArch: ftrace: Abstract DYNAMIC_FTRACE_WITH_ARGS accesses") Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h index a11996eb5892d..de891c2c83d4a 100644 --- a/arch/loongarch/include/asm/ftrace.h +++ b/arch/loongarch/include/asm/ftrace.h @@ -63,7 +63,7 @@ ftrace_regs_get_instruction_pointer(struct ftrace_regs *fregs) static __always_inline void ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip) { - regs_set_return_value(&fregs->regs, ip); + instruction_pointer_set(&fregs->regs, ip); } #define ftrace_regs_get_argument(fregs, n) \ -- GitLab From 21c5ae5cc1eee70f7f3b09f1d6b237d9812d4b9c Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Wed, 17 Jan 2024 12:43:13 +0800 Subject: [PATCH 1029/1290] LoongArch: BPF: Support 64-bit pointers to kfuncs Like commit 1cf3bfc60f9836f ("bpf: Support 64-bit pointers to kfuncs") for s390x, add support for 64-bit pointers to kfuncs for LoongArch. Since the infrastructure is already implemented in BPF core, the only thing need to be done is to override bpf_jit_supports_far_kfunc_call(). Before this change, several test_verifier tests failed: # ./test_verifier | grep # | grep FAIL #119/p calls: invalid kfunc call: ptr_to_mem to struct with non-scalar FAIL #120/p calls: invalid kfunc call: ptr_to_mem to struct with nesting depth > 4 FAIL #121/p calls: invalid kfunc call: ptr_to_mem to struct with FAM FAIL #122/p calls: invalid kfunc call: reg->type != PTR_TO_CTX FAIL #123/p calls: invalid kfunc call: void * not allowed in func proto without mem size arg FAIL #124/p calls: trigger reg2btf_ids[reg->type] for reg->type > __BPF_REG_TYPE_MAX FAIL #125/p calls: invalid kfunc call: reg->off must be zero when passed to release kfunc FAIL #126/p calls: invalid kfunc call: don't match first member type when passed to release kfunc FAIL #127/p calls: invalid kfunc call: PTR_TO_BTF_ID with negative offset FAIL #128/p calls: invalid kfunc call: PTR_TO_BTF_ID with variable offset FAIL #129/p calls: invalid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID FAIL #130/p calls: valid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID FAIL #486/p map_kptr: ref: reference state created and released on xchg FAIL This is because the kfuncs in the loaded module are far away from __bpf_call_base: ffff800002009440 t bpf_kfunc_call_test_fail1 [bpf_testmod] 9000000002e128d8 T __bpf_call_base The offset relative to __bpf_call_base does NOT fit in s32, which breaks the assumption in BPF core. Enable bpf_jit_supports_far_kfunc_call() lifts this limit. Note that to reproduce the above result, tools/testing/selftests/bpf/config should be applied, and run the test with JIT enabled, unpriv BPF enabled. With this change, the test_verifier tests now all passed: # ./test_verifier ... Summary: 777 PASSED, 0 SKIPPED, 0 FAILED Tested-by: Tiezhu Yang Signed-off-by: Hengqi Chen Signed-off-by: Huacai Chen --- arch/loongarch/net/bpf_jit.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index 4fcd6cd6da234..2f154c60ee007 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -201,6 +201,11 @@ bool bpf_jit_supports_kfunc_call(void) return true; } +bool bpf_jit_supports_far_kfunc_call(void) +{ + return true; +} + /* initialized on the first pass of build_body() */ static int out_offset = -1; static int emit_bpf_tail_call(struct jit_ctx *ctx) -- GitLab From 36a87385e31c9343af9a4756598e704741250a67 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Wed, 17 Jan 2024 12:43:13 +0800 Subject: [PATCH 1030/1290] LoongArch: BPF: Prevent out-of-bounds memory access The test_tag test triggers an unhandled page fault: # ./test_tag [ 130.640218] CPU 0 Unable to handle kernel paging request at virtual address ffff80001b898004, era == 9000000003137f7c, ra == 9000000003139e70 [ 130.640501] Oops[#3]: [ 130.640553] CPU: 0 PID: 1326 Comm: test_tag Tainted: G D O 6.7.0-rc4-loong-devel-gb62ab1a397cf #47 61985c1d94084daa2432f771daa45b56b10d8d2a [ 130.640764] Hardware name: QEMU QEMU Virtual Machine, BIOS unknown 2/2/2022 [ 130.640874] pc 9000000003137f7c ra 9000000003139e70 tp 9000000104cb4000 sp 9000000104cb7a40 [ 130.641001] a0 ffff80001b894000 a1 ffff80001b897ff8 a2 000000006ba210be a3 0000000000000000 [ 130.641128] a4 000000006ba210be a5 00000000000000f1 a6 00000000000000b3 a7 0000000000000000 [ 130.641256] t0 0000000000000000 t1 00000000000007f6 t2 0000000000000000 t3 9000000004091b70 [ 130.641387] t4 000000006ba210be t5 0000000000000004 t6 fffffffffffffff0 t7 90000000040913e0 [ 130.641512] t8 0000000000000005 u0 0000000000000dc0 s9 0000000000000009 s0 9000000104cb7ae0 [ 130.641641] s1 00000000000007f6 s2 0000000000000009 s3 0000000000000095 s4 0000000000000000 [ 130.641771] s5 ffff80001b894000 s6 ffff80001b897fb0 s7 9000000004090c50 s8 0000000000000000 [ 130.641900] ra: 9000000003139e70 build_body+0x1fcc/0x4988 [ 130.642007] ERA: 9000000003137f7c build_body+0xd8/0x4988 [ 130.642112] CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) [ 130.642261] PRMD: 00000004 (PPLV0 +PIE -PWE) [ 130.642353] EUEN: 00000003 (+FPE +SXE -ASXE -BTE) [ 130.642458] ECFG: 00071c1c (LIE=2-4,10-12 VS=7) [ 130.642554] ESTAT: 00010000 [PIL] (IS= ECode=1 EsubCode=0) [ 130.642658] BADV: ffff80001b898004 [ 130.642719] PRID: 0014c010 (Loongson-64bit, Loongson-3A5000) [ 130.642815] Modules linked in: [last unloaded: bpf_testmod(O)] [ 130.642924] Process test_tag (pid: 1326, threadinfo=00000000f7f4015f, task=000000006499f9fd) [ 130.643062] Stack : 0000000000000000 9000000003380724 0000000000000000 0000000104cb7be8 [ 130.643213] 0000000000000000 25af8d9b6e600558 9000000106250ea0 9000000104cb7ae0 [ 130.643378] 0000000000000000 0000000000000000 9000000104cb7be8 90000000049f6000 [ 130.643538] 0000000000000090 9000000106250ea0 ffff80001b894000 ffff80001b894000 [ 130.643685] 00007ffffb917790 900000000313ca94 0000000000000000 0000000000000000 [ 130.643831] ffff80001b894000 0000000000000ff7 0000000000000000 9000000100468000 [ 130.643983] 0000000000000000 0000000000000000 0000000000000040 25af8d9b6e600558 [ 130.644131] 0000000000000bb7 ffff80001b894048 0000000000000000 0000000000000000 [ 130.644276] 9000000104cb7be8 90000000049f6000 0000000000000090 9000000104cb7bdc [ 130.644423] ffff80001b894000 0000000000000000 00007ffffb917790 90000000032acfb0 [ 130.644572] ... [ 130.644629] Call Trace: [ 130.644641] [<9000000003137f7c>] build_body+0xd8/0x4988 [ 130.644785] [<900000000313ca94>] bpf_int_jit_compile+0x228/0x4ec [ 130.644891] [<90000000032acfb0>] bpf_prog_select_runtime+0x158/0x1b0 [ 130.645003] [<90000000032b3504>] bpf_prog_load+0x760/0xb44 [ 130.645089] [<90000000032b6744>] __sys_bpf+0xbb8/0x2588 [ 130.645175] [<90000000032b8388>] sys_bpf+0x20/0x2c [ 130.645259] [<9000000003f6ab38>] do_syscall+0x7c/0x94 [ 130.645369] [<9000000003121c5c>] handle_syscall+0xbc/0x158 [ 130.645507] [ 130.645539] Code: 380839f6 380831f9 28412bae <24000ca6> 004081ad 0014cb50 004083e8 02bff34c 58008e91 [ 130.645729] [ 130.646418] ---[ end trace 0000000000000000 ]--- On my machine, which has CONFIG_PAGE_SIZE_16KB=y, the test failed at loading a BPF prog with 2039 instructions: prog = (struct bpf_prog *)ffff80001b894000 insn = (struct bpf_insn *)(prog->insnsi)ffff80001b894048 insn + 2039 = (struct bpf_insn *)ffff80001b898000 <- end of the page In the build_insn() function, we are trying to access next instruction unconditionally, i.e. `(insn + 1)->imm`. The address lies in the next page and can be not owned by the current process, thus an page fault is inevitable and then segfault. So, let's access next instruction only under `dst = imm64` context. With this fix, we have: # ./test_tag test_tag: OK (40945 tests) Fixes: bbfddb904df6f82 ("LoongArch: BPF: Avoid declare variables in switch-case") Tested-by: Tiezhu Yang Signed-off-by: Hengqi Chen Signed-off-by: Huacai Chen --- arch/loongarch/net/bpf_jit.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index 2f154c60ee007..e73323d759d0b 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -470,7 +470,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool ext const u8 dst = regmap[insn->dst_reg]; const s16 off = insn->off; const s32 imm = insn->imm; - const u64 imm64 = (u64)(insn + 1)->imm << 32 | (u32)insn->imm; const bool is32 = BPF_CLASS(insn->code) == BPF_ALU || BPF_CLASS(insn->code) == BPF_JMP32; switch (code) { @@ -928,8 +927,12 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool ext /* dst = imm64 */ case BPF_LD | BPF_IMM | BPF_DW: + { + const u64 imm64 = (u64)(insn + 1)->imm << 32 | (u32)insn->imm; + move_imm(ctx, dst, imm64, is32); return 1; + } /* dst = *(size *)(src + off) */ case BPF_LDX | BPF_MEM | BPF_B: -- GitLab From fc562925f51c0ce462c17b24a5c538db676af576 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 17 Jan 2024 12:43:13 +0800 Subject: [PATCH 1031/1290] LoongArch: Update Loongson-3 default config file 1, Increase NR_CPUS to 256. 2, Enable some cgroup options. 3, Enable some PREEMPT_DYNAMIC/SCHED_CORE options. 4, Enable some CMA/DMA_CMA options. 5, Enable some F2FS options. 6, Enable some DMABUF/UDMABUF options. 7, Enable some USB4 and NTB options. 8, Enable some networking options (MPTCP). 9, Enable Loongson-specific drivers: APB DMA, ASoC. 10, Enable PCI_HOST_GENERIC and SND_VIRTIO for virtual machine. 11, Remove obsolete SECURITY_SELINUX_DISABLE. 12, Regenerate the whole file to keep the order of options be the same as the latest source code. Signed-off-by: Huacai Chen --- arch/loongarch/configs/loongson3_defconfig | 55 ++++++++++++++++++---- 1 file changed, 47 insertions(+), 8 deletions(-) diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 33795e4a5bd63..fd8d8a38cf5de 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -6,6 +6,8 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_BPF_SYSCALL=y CONFIG_BPF_JIT=y CONFIG_PREEMPT=y +CONFIG_PREEMPT_DYNAMIC=y +CONFIG_SCHED_CORE=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_TASKSTATS=y @@ -19,6 +21,7 @@ CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_HUGETLB=y CONFIG_CPUSETS=y @@ -26,6 +29,7 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y +CONFIG_CGROUP_MISC=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y @@ -35,6 +39,8 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y CONFIG_KALLSYMS_ALL=y CONFIG_PERF_EVENTS=y +CONFIG_KEXEC=y +CONFIG_CRASH_DUMP=y CONFIG_LOONGARCH=y CONFIG_64BIT=y CONFIG_MACH_LOONGSON64=y @@ -44,13 +50,11 @@ CONFIG_DMI=y CONFIG_EFI=y CONFIG_SMP=y CONFIG_HOTPLUG_CPU=y -CONFIG_NR_CPUS=64 +CONFIG_NR_CPUS=256 CONFIG_NUMA=y CONFIG_CPU_HAS_FPU=y CONFIG_CPU_HAS_LSX=y CONFIG_CPU_HAS_LASX=y -CONFIG_KEXEC=y -CONFIG_CRASH_DUMP=y CONFIG_RANDOMIZE_BASE=y CONFIG_SUSPEND=y CONFIG_HIBERNATION=y @@ -62,10 +66,6 @@ CONFIG_ACPI_IPMI=m CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_PCI_SLOT=y CONFIG_ACPI_HOTPLUG_MEMORY=y -CONFIG_EFI_ZBOOT=y -CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y -CONFIG_EFI_CAPSULE_LOADER=m -CONFIG_EFI_TEST=m CONFIG_VIRTUALIZATION=y CONFIG_KVM=m CONFIG_JUMP_LABEL=y @@ -74,10 +74,18 @@ CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y +CONFIG_BLK_DEV_ZONED=y CONFIG_BLK_DEV_THROTTLING=y +CONFIG_BLK_DEV_THROTTLING_LOW=y +CONFIG_BLK_WBT=y +CONFIG_BLK_CGROUP_IOLATENCY=y +CONFIG_BLK_CGROUP_FC_APPID=y +CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_CGROUP_IOPRIO=y CONFIG_PARTITION_ADVANCED=y CONFIG_BSD_DISKLABEL=y CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_CMDLINE_PARTITION=y CONFIG_IOSCHED_BFQ=y CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_BINFMT_MISC=m @@ -93,6 +101,8 @@ CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_CMA=y +CONFIG_CMA_SYSFS=y CONFIG_USERFAULTFD=y CONFIG_NET=y CONFIG_PACKET=y @@ -128,6 +138,7 @@ CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y CONFIG_INET6_ESP=m CONFIG_IPV6_MROUTE=y +CONFIG_MPTCP=y CONFIG_NETWORK_PHY_TIMESTAMPING=y CONFIG_NETFILTER=y CONFIG_BRIDGE_NETFILTER=m @@ -354,6 +365,7 @@ CONFIG_PCIEAER=y CONFIG_PCI_IOV=y CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_SHPC=y +CONFIG_PCI_HOST_GENERIC=y CONFIG_PCCARD=m CONFIG_YENTA=m CONFIG_RAPIDIO=y @@ -367,6 +379,10 @@ CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_FW_LOADER_COMPRESS=y CONFIG_FW_LOADER_COMPRESS_ZSTD=y +CONFIG_EFI_ZBOOT=y +CONFIG_EFI_BOOTLOADER_CONTROL=m +CONFIG_EFI_CAPSULE_LOADER=m +CONFIG_EFI_TEST=m CONFIG_MTD=m CONFIG_MTD_BLOCK=m CONFIG_MTD_CFI=m @@ -588,6 +604,7 @@ CONFIG_RTW89_8852AE=m CONFIG_RTW89_8852CE=m CONFIG_ZD1211RW=m CONFIG_USB_NET_RNDIS_WLAN=m +CONFIG_USB4_NET=m CONFIG_INPUT_MOUSEDEV=y CONFIG_INPUT_MOUSEDEV_PSAUX=y CONFIG_INPUT_EVDEV=y @@ -693,6 +710,9 @@ CONFIG_SND_HDA_CODEC_SIGMATEL=y CONFIG_SND_HDA_CODEC_HDMI=y CONFIG_SND_HDA_CODEC_CONEXANT=y CONFIG_SND_USB_AUDIO=m +CONFIG_SND_SOC=m +CONFIG_SND_SOC_LOONGSON_CARD=m +CONFIG_SND_VIRTIO=m CONFIG_HIDRAW=y CONFIG_UHID=m CONFIG_HID_A4TECH=m @@ -740,6 +760,11 @@ CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_EFI=y CONFIG_RTC_DRV_LOONGSON=y CONFIG_DMADEVICES=y +CONFIG_LS2X_APB_DMA=y +CONFIG_UDMABUF=y +CONFIG_DMABUF_HEAPS=y +CONFIG_DMABUF_HEAPS_SYSTEM=y +CONFIG_DMABUF_HEAPS_CMA=y CONFIG_UIO=m CONFIG_UIO_PDRV_GENIRQ=m CONFIG_UIO_DMEM_GENIRQ=m @@ -780,7 +805,15 @@ CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y CONFIG_DEVFREQ_GOV_PERFORMANCE=y CONFIG_DEVFREQ_GOV_POWERSAVE=y CONFIG_DEVFREQ_GOV_USERSPACE=y +CONFIG_NTB=m +CONFIG_NTB_MSI=y +CONFIG_NTB_IDT=m +CONFIG_NTB_EPF=m +CONFIG_NTB_SWITCHTEC=m +CONFIG_NTB_PERF=m +CONFIG_NTB_TRANSPORT=m CONFIG_PWM=y +CONFIG_USB4=y CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y @@ -799,6 +832,10 @@ CONFIG_GFS2_FS_LOCKING_DLM=y CONFIG_OCFS2_FS=m CONFIG_BTRFS_FS=y CONFIG_BTRFS_FS_POSIX_ACL=y +CONFIG_F2FS_FS=m +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_CHECK_FS=y +CONFIG_F2FS_FS_COMPRESSION=y CONFIG_FANOTIFY=y CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y CONFIG_QUOTA=y @@ -885,7 +922,6 @@ CONFIG_KEY_DH_OPERATIONS=y CONFIG_SECURITY=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_SECURITY_APPARMOR=y CONFIG_SECURITY_YAMA=y CONFIG_DEFAULT_SECURITY_DAC=y @@ -916,6 +952,9 @@ CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_CRYPTO_CRC32_LOONGARCH=m CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_DMA_CMA=y +CONFIG_DMA_NUMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=0 CONFIG_PRINTK_TIME=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y -- GitLab From 6e441fa3ac475be73c03c9a85bd305d66ea476a6 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 17 Jan 2024 12:43:13 +0800 Subject: [PATCH 1032/1290] MAINTAINERS: Add BPF JIT for LOONGARCH entry After commit 5dc615520c4d ("LoongArch: Add BPF JIT support"), there is no BPF JIT for LOONGARCH entry, in order to maintain the current code and the new features timely, just add it. Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a7c4cf8201e01..afe2f311232fb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3651,6 +3651,13 @@ L: bpf@vger.kernel.org S: Supported F: arch/arm64/net/ +BPF JIT for LOONGARCH +M: Tiezhu Yang +R: Hengqi Chen +L: bpf@vger.kernel.org +S: Maintained +F: arch/loongarch/net/ + BPF JIT for MIPS (32-BIT AND 64-BIT) M: Johan Almbladh M: Paul Burton -- GitLab From efb8235bfdbe661c460f803150b50840a73b5f03 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 15 Jan 2024 12:17:43 +0100 Subject: [PATCH 1033/1290] gpiolib: revert the attempt to protect the GPIO device list with an rwsem This reverts commits 1979a2807547 ("gpiolib: replace the GPIO device mutex with a read-write semaphore") and 65a828bab158 ("gpiolib: use a mutex to protect the list of GPIO devices"). Unfortunately the legacy GPIO API that's still used in older code has to translate numbers from the global GPIO numberspace to descriptors. This results in a GPIO device lookup in every call to legacy functions. Some of those functions - like gpio_set/get_value() - can be called from atomic context so taking a sleeping lock that is an RW semaphore results in an error. We'll probably have to protect this list with SRCU. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-wireless/f7b5ff1e-8f34-4d98-a7be-b826cb897dc8@moroto.mountain/ Fixes: 1979a2807547 ("gpiolib: replace the GPIO device mutex with a read-write semaphore") Fixes: 65a828bab158 ("gpiolib: use a mutex to protect the list of GPIO devices") Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-sysfs.c | 45 ++++++------ drivers/gpio/gpiolib-sysfs.h | 6 -- drivers/gpio/gpiolib.c | 133 +++++++++++++++++++---------------- drivers/gpio/gpiolib.h | 2 - 4 files changed, 97 insertions(+), 89 deletions(-) diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c index 4dbf298bb5dda..6bf5332136e5a 100644 --- a/drivers/gpio/gpiolib-sysfs.c +++ b/drivers/gpio/gpiolib-sysfs.c @@ -768,25 +768,6 @@ int gpiochip_sysfs_register(struct gpio_device *gdev) return 0; } -int gpiochip_sysfs_register_all(void) -{ - struct gpio_device *gdev; - int ret; - - guard(rwsem_read)(&gpio_devices_sem); - - list_for_each_entry(gdev, &gpio_devices, list) { - if (gdev->mockdev) - continue; - - ret = gpiochip_sysfs_register(gdev); - if (ret) - return ret; - } - - return 0; -} - void gpiochip_sysfs_unregister(struct gpio_device *gdev) { struct gpio_desc *desc; @@ -811,7 +792,9 @@ void gpiochip_sysfs_unregister(struct gpio_device *gdev) static int __init gpiolib_sysfs_init(void) { - int status; + int status; + unsigned long flags; + struct gpio_device *gdev; status = class_register(&gpio_class); if (status < 0) @@ -823,6 +806,26 @@ static int __init gpiolib_sysfs_init(void) * We run before arch_initcall() so chip->dev nodes can have * registered, and so arch_initcall() can always gpiod_export(). */ - return gpiochip_sysfs_register_all(); + spin_lock_irqsave(&gpio_lock, flags); + list_for_each_entry(gdev, &gpio_devices, list) { + if (gdev->mockdev) + continue; + + /* + * TODO we yield gpio_lock here because + * gpiochip_sysfs_register() acquires a mutex. This is unsafe + * and needs to be fixed. + * + * Also it would be nice to use gpio_device_find() here so we + * can keep gpio_chips local to gpiolib.c, but the yield of + * gpio_lock prevents us from doing this. + */ + spin_unlock_irqrestore(&gpio_lock, flags); + status = gpiochip_sysfs_register(gdev); + spin_lock_irqsave(&gpio_lock, flags); + } + spin_unlock_irqrestore(&gpio_lock, flags); + + return status; } postcore_initcall(gpiolib_sysfs_init); diff --git a/drivers/gpio/gpiolib-sysfs.h b/drivers/gpio/gpiolib-sysfs.h index ab157cec0b4be..b794b396d6a52 100644 --- a/drivers/gpio/gpiolib-sysfs.h +++ b/drivers/gpio/gpiolib-sysfs.h @@ -8,7 +8,6 @@ struct gpio_device; #ifdef CONFIG_GPIO_SYSFS int gpiochip_sysfs_register(struct gpio_device *gdev); -int gpiochip_sysfs_register_all(void); void gpiochip_sysfs_unregister(struct gpio_device *gdev); #else @@ -18,11 +17,6 @@ static inline int gpiochip_sysfs_register(struct gpio_device *gdev) return 0; } -static inline int gpiochip_sysfs_register_all(void) -{ - return 0; -} - static inline void gpiochip_sysfs_unregister(struct gpio_device *gdev) { } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 4c93cf73a8260..44c8f5743a241 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2,7 +2,6 @@ #include #include -#include #include #include #include @@ -16,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -83,9 +81,7 @@ DEFINE_SPINLOCK(gpio_lock); static DEFINE_MUTEX(gpio_lookup_lock); static LIST_HEAD(gpio_lookup_list); - LIST_HEAD(gpio_devices); -DECLARE_RWSEM(gpio_devices_sem); static DEFINE_MUTEX(gpio_machine_hogs_mutex); static LIST_HEAD(gpio_machine_hogs); @@ -117,15 +113,20 @@ static inline void desc_set_label(struct gpio_desc *d, const char *label) struct gpio_desc *gpio_to_desc(unsigned gpio) { struct gpio_device *gdev; + unsigned long flags; + + spin_lock_irqsave(&gpio_lock, flags); - scoped_guard(rwsem_read, &gpio_devices_sem) { - list_for_each_entry(gdev, &gpio_devices, list) { - if (gdev->base <= gpio && - gdev->base + gdev->ngpio > gpio) - return &gdev->descs[gpio - gdev->base]; + list_for_each_entry(gdev, &gpio_devices, list) { + if (gdev->base <= gpio && + gdev->base + gdev->ngpio > gpio) { + spin_unlock_irqrestore(&gpio_lock, flags); + return &gdev->descs[gpio - gdev->base]; } } + spin_unlock_irqrestore(&gpio_lock, flags); + if (!gpio_is_valid(gpio)) pr_warn("invalid GPIO %d\n", gpio); @@ -398,21 +399,26 @@ static int gpiodev_add_to_list_unlocked(struct gpio_device *gdev) static struct gpio_desc *gpio_name_to_desc(const char * const name) { struct gpio_device *gdev; + unsigned long flags; if (!name) return NULL; - guard(rwsem_read)(&gpio_devices_sem); + spin_lock_irqsave(&gpio_lock, flags); list_for_each_entry(gdev, &gpio_devices, list) { struct gpio_desc *desc; for_each_gpio_desc(gdev->chip, desc) { - if (desc->name && !strcmp(desc->name, name)) + if (desc->name && !strcmp(desc->name, name)) { + spin_unlock_irqrestore(&gpio_lock, flags); return desc; + } } } + spin_unlock_irqrestore(&gpio_lock, flags); + return NULL; } @@ -807,6 +813,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, struct lock_class_key *request_key) { struct gpio_device *gdev; + unsigned long flags; unsigned int i; int base = 0; int ret = 0; @@ -871,46 +878,49 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, gdev->ngpio = gc->ngpio; - scoped_guard(rwsem_write, &gpio_devices_sem) { - /* - * TODO: this allocates a Linux GPIO number base in the global - * GPIO numberspace for this chip. In the long run we want to - * get *rid* of this numberspace and use only descriptors, but - * it may be a pipe dream. It will not happen before we get rid - * of the sysfs interface anyways. - */ - base = gc->base; + spin_lock_irqsave(&gpio_lock, flags); + /* + * TODO: this allocates a Linux GPIO number base in the global + * GPIO numberspace for this chip. In the long run we want to + * get *rid* of this numberspace and use only descriptors, but + * it may be a pipe dream. It will not happen before we get rid + * of the sysfs interface anyways. + */ + base = gc->base; + if (base < 0) { + base = gpiochip_find_base_unlocked(gc->ngpio); if (base < 0) { - base = gpiochip_find_base_unlocked(gc->ngpio); - if (base < 0) { - ret = base; - base = 0; - goto err_free_label; - } - /* - * TODO: it should not be necessary to reflect the assigned - * base outside of the GPIO subsystem. Go over drivers and - * see if anyone makes use of this, else drop this and assign - * a poison instead. - */ - gc->base = base; - } else { - dev_warn(&gdev->dev, - "Static allocation of GPIO base is deprecated, use dynamic allocation.\n"); - } - gdev->base = base; - - ret = gpiodev_add_to_list_unlocked(gdev); - if (ret) { - chip_err(gc, "GPIO integer space overlap, cannot add chip\n"); + spin_unlock_irqrestore(&gpio_lock, flags); + ret = base; + base = 0; goto err_free_label; } + /* + * TODO: it should not be necessary to reflect the assigned + * base outside of the GPIO subsystem. Go over drivers and + * see if anyone makes use of this, else drop this and assign + * a poison instead. + */ + gc->base = base; + } else { + dev_warn(&gdev->dev, + "Static allocation of GPIO base is deprecated, use dynamic allocation.\n"); + } + gdev->base = base; - for (i = 0; i < gc->ngpio; i++) - gdev->descs[i].gdev = gdev; + ret = gpiodev_add_to_list_unlocked(gdev); + if (ret) { + spin_unlock_irqrestore(&gpio_lock, flags); + chip_err(gc, "GPIO integer space overlap, cannot add chip\n"); + goto err_free_label; } + for (i = 0; i < gc->ngpio; i++) + gdev->descs[i].gdev = gdev; + + spin_unlock_irqrestore(&gpio_lock, flags); + BLOCKING_INIT_NOTIFIER_HEAD(&gdev->line_state_notifier); BLOCKING_INIT_NOTIFIER_HEAD(&gdev->device_notifier); init_rwsem(&gdev->sem); @@ -1001,8 +1011,9 @@ err_free_gpiochip_mask: goto err_print_message; } err_remove_from_list: - scoped_guard(rwsem_write, &gpio_devices_sem) - list_del(&gdev->list); + spin_lock_irqsave(&gpio_lock, flags); + list_del(&gdev->list); + spin_unlock_irqrestore(&gpio_lock, flags); err_free_label: kfree_const(gdev->label); err_free_descs: @@ -1065,7 +1076,7 @@ void gpiochip_remove(struct gpio_chip *gc) dev_crit(&gdev->dev, "REMOVING GPIOCHIP WITH GPIOS STILL REQUESTED\n"); - scoped_guard(rwsem_write, &gpio_devices_sem) + scoped_guard(spinlock_irqsave, &gpio_lock) list_del(&gdev->list); /* @@ -1114,7 +1125,7 @@ struct gpio_device *gpio_device_find(void *data, */ might_sleep(); - guard(rwsem_read)(&gpio_devices_sem); + guard(spinlock_irqsave)(&gpio_lock); list_for_each_entry(gdev, &gpio_devices, list) { if (gdev->chip && match(gdev->chip, data)) @@ -4725,33 +4736,35 @@ static void gpiolib_dbg_show(struct seq_file *s, struct gpio_device *gdev) static void *gpiolib_seq_start(struct seq_file *s, loff_t *pos) { + unsigned long flags; struct gpio_device *gdev = NULL; loff_t index = *pos; s->private = ""; - guard(rwsem_read)(&gpio_devices_sem); - - list_for_each_entry(gdev, &gpio_devices, list) { - if (index-- == 0) + spin_lock_irqsave(&gpio_lock, flags); + list_for_each_entry(gdev, &gpio_devices, list) + if (index-- == 0) { + spin_unlock_irqrestore(&gpio_lock, flags); return gdev; - } + } + spin_unlock_irqrestore(&gpio_lock, flags); return NULL; } static void *gpiolib_seq_next(struct seq_file *s, void *v, loff_t *pos) { + unsigned long flags; struct gpio_device *gdev = v; void *ret = NULL; - scoped_guard(rwsem_read, &gpio_devices_sem) { - if (list_is_last(&gdev->list, &gpio_devices)) - ret = NULL; - else - ret = list_first_entry(&gdev->list, struct gpio_device, - list); - } + spin_lock_irqsave(&gpio_lock, flags); + if (list_is_last(&gdev->list, &gpio_devices)) + ret = NULL; + else + ret = list_first_entry(&gdev->list, struct gpio_device, list); + spin_unlock_irqrestore(&gpio_lock, flags); s->private = "\n"; ++*pos; diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index 97df54abf57ac..a4a2520b5f31c 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -15,7 +15,6 @@ #include /* for enum gpiod_flags */ #include #include -#include #include #include @@ -137,7 +136,6 @@ int gpiod_set_transitory(struct gpio_desc *desc, bool transitory); extern spinlock_t gpio_lock; extern struct list_head gpio_devices; -extern struct rw_semaphore gpio_devices_sem; void gpiod_line_state_notify(struct gpio_desc *desc, unsigned long action); -- GitLab From c2945c435c999c63e47f337bc7c13c98c21d0bcc Mon Sep 17 00:00:00 2001 From: Romain Gantois Date: Tue, 16 Jan 2024 13:19:17 +0100 Subject: [PATCH 1034/1290] net: stmmac: Prevent DSA tags from breaking COE Some DSA tagging protocols change the EtherType field in the MAC header e.g. DSA_TAG_PROTO_(DSA/EDSA/BRCM/MTK/RTL4C_A/SJA1105). On TX these tagged frames are ignored by the checksum offload engine and IP header checker of some stmmac cores. On RX, the stmmac driver wrongly assumes that checksums have been computed for these tagged packets, and sets CHECKSUM_UNNECESSARY. Add an additional check in the stmmac TX and RX hotpaths so that COE is deactivated for packets with ethertypes that will not trigger the COE and IP header checks. Fixes: 6b2c6e4a938f ("net: stmmac: propagate feature flags to vlan") Cc: Reported-by: Richard Tresidder Link: https://lore.kernel.org/netdev/e5c6c75f-2dfa-4e50-a1fb-6bf4cdb617c2@electromag.com.au/ Reported-by: Romain Gantois Link: https://lore.kernel.org/netdev/c57283ed-6b9b-b0e6-ee12-5655c1c54495@bootlin.com/ Reviewed-by: Vladimir Oltean Reviewed-by: Linus Walleij Signed-off-by: Romain Gantois Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index c78a96b8eb649..a0e46369ae158 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4435,6 +4435,28 @@ dma_map_err: return NETDEV_TX_OK; } +/** + * stmmac_has_ip_ethertype() - Check if packet has IP ethertype + * @skb: socket buffer to check + * + * Check if a packet has an ethertype that will trigger the IP header checks + * and IP/TCP checksum engine of the stmmac core. + * + * Return: true if the ethertype can trigger the checksum engine, false + * otherwise + */ +static bool stmmac_has_ip_ethertype(struct sk_buff *skb) +{ + int depth = 0; + __be16 proto; + + proto = __vlan_get_protocol(skb, eth_header_parse_protocol(skb), + &depth); + + return (depth <= ETH_HLEN) && + (proto == htons(ETH_P_IP) || proto == htons(ETH_P_IPV6)); +} + /** * stmmac_xmit - Tx entry point of the driver * @skb : the socket buffer @@ -4499,9 +4521,13 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) /* DWMAC IPs can be synthesized to support tx coe only for a few tx * queues. In that case, checksum offloading for those queues that don't * support tx coe needs to fallback to software checksum calculation. + * + * Packets that won't trigger the COE e.g. most DSA-tagged packets will + * also have to be checksummed in software. */ if (csum_insertion && - priv->plat->tx_queues_cfg[queue].coe_unsupported) { + (priv->plat->tx_queues_cfg[queue].coe_unsupported || + !stmmac_has_ip_ethertype(skb))) { if (unlikely(skb_checksum_help(skb))) goto dma_map_err; csum_insertion = !csum_insertion; @@ -5066,7 +5092,7 @@ static void stmmac_dispatch_skb_zc(struct stmmac_priv *priv, u32 queue, stmmac_rx_vlan(priv->dev, skb); skb->protocol = eth_type_trans(skb, priv->dev); - if (unlikely(!coe)) + if (unlikely(!coe) || !stmmac_has_ip_ethertype(skb)) skb_checksum_none_assert(skb); else skb->ip_summed = CHECKSUM_UNNECESSARY; @@ -5589,7 +5615,7 @@ drain_data: skb->protocol = eth_type_trans(skb, priv->dev); - if (unlikely(!coe)) + if (unlikely(!coe) || !stmmac_has_ip_ethertype(skb)) skb_checksum_none_assert(skb); else skb->ip_summed = CHECKSUM_UNNECESSARY; -- GitLab From c0f5aec28edf98906d28f08daace6522adf9ee7a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 16 Jan 2024 18:18:47 +0100 Subject: [PATCH 1035/1290] mptcp: relax check on MPC passive fallback While testing the blamed commit below, I was able to miss (!) packetdrill failures in the fastopen test-cases. On passive fastopen the child socket is created by incoming TCP MPC syn, allow for both MPC_SYN and MPC_ACK header. Fixes: 724b00c12957 ("mptcp: refine opt_mp_capable determination") Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 1117d1e84274a..0dcb721c89d19 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -783,7 +783,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * options. */ mptcp_get_options(skb, &mp_opt); - if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK)) + if (!(mp_opt.suboptions & + (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK))) fallback = true; } else if (subflow_req->mp_join) { -- GitLab From ea937f77208323d35ffe2f8d8fc81b00118bfcda Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2024 11:14:00 -0800 Subject: [PATCH 1036/1290] net: netdevsim: don't try to destroy PHC on VFs PHC gets initialized in nsim_init_netdevsim(), which is only called if (nsim_dev_port_is_pf()). Create a counterpart of nsim_init_netdevsim() and move the mock_phc_destroy() there. This fixes a crash trying to destroy netdevsim with VFs instantiated, as caught by running the devlink.sh test: BUG: kernel NULL pointer dereference, address: 00000000000000b8 RIP: 0010:mock_phc_destroy+0xd/0x30 Call Trace: nsim_destroy+0x4a/0x70 [netdevsim] __nsim_dev_port_del+0x47/0x70 [netdevsim] nsim_dev_reload_destroy+0x105/0x120 [netdevsim] nsim_drv_remove+0x2f/0xb0 [netdevsim] device_release_driver_internal+0x1a1/0x210 bus_remove_device+0xd5/0x120 device_del+0x159/0x490 device_unregister+0x12/0x30 del_device_store+0x11a/0x1a0 [netdevsim] kernfs_fop_write_iter+0x130/0x1d0 vfs_write+0x30b/0x4b0 ksys_write+0x69/0xf0 do_syscall_64+0xcc/0x1e0 entry_SYSCALL_64_after_hwframe+0x6f/0x77 Fixes: b63e78fca889 ("net: netdevsim: use mock PHC driver") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/netdevsim/netdev.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index aecaf5f44374f..77e8250282a51 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -369,6 +369,12 @@ static int nsim_init_netdevsim_vf(struct netdevsim *ns) return err; } +static void nsim_exit_netdevsim(struct netdevsim *ns) +{ + nsim_udp_tunnels_info_destroy(ns->netdev); + mock_phc_destroy(ns->phc); +} + struct netdevsim * nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port) { @@ -417,8 +423,7 @@ void nsim_destroy(struct netdevsim *ns) } rtnl_unlock(); if (nsim_dev_port_is_pf(ns->nsim_dev_port)) - nsim_udp_tunnels_info_destroy(dev); - mock_phc_destroy(ns->phc); + nsim_exit_netdevsim(ns); free_netdev(dev); } -- GitLab From 0617c3de9b4026b87be12b0cb5c35f42c7c66fcb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 3 Jan 2024 23:34:58 +0100 Subject: [PATCH 1037/1290] netfilter: nf_tables: reject invalid set policy Report -EINVAL in case userspace provides a unsupported set backend policy. Fixes: c50b960ccc59 ("netfilter: nf_tables: implement proper set selection") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8438a8922e4ab..a90a364f5be5a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5048,8 +5048,16 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, } desc.policy = NFT_SET_POL_PERFORMANCE; - if (nla[NFTA_SET_POLICY] != NULL) + if (nla[NFTA_SET_POLICY] != NULL) { desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); + switch (desc.policy) { + case NFT_SET_POL_PERFORMANCE: + case NFT_SET_POL_MEMORY: + break; + default: + return -EOPNOTSUPP; + } + } if (nla[NFTA_SET_DESC] != NULL) { err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]); -- GitLab From 65b3bd600e15e00fb9e433bbc8aa55e42b202055 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 6 Jan 2024 23:52:02 +0100 Subject: [PATCH 1038/1290] netfilter: nf_tables: validate .maxattr at expression registration struct nft_expr_info allows to store up to NFT_EXPR_MAXATTR (16) attributes when parsing netlink attributes. Rise a warning in case there is ever a nft expression whose .maxattr goes beyond this number of expressions, in such case, struct nft_expr_info needs to be updated. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a90a364f5be5a..2548d7d56408c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2977,6 +2977,9 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, */ int nft_register_expr(struct nft_expr_type *type) { + if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR)) + return -ENOMEM; + nfnl_lock(NFNL_SUBSYS_NFTABLES); if (type->family == NFPROTO_UNSPEC) list_add_tail_rcu(&type->list, &nf_tables_expressions); -- GitLab From 3c13725f43dcf43ad8a9bcd6a9f12add19a8f93e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 7 Jan 2024 23:00:15 +0100 Subject: [PATCH 1039/1290] netfilter: nf_tables: bail out if stateful expression provides no .clone All existing NFT_EXPR_STATEFUL provide a .clone interface, remove fallback to copy content of stateful expression since this is never exercised and bail out if .clone interface is not defined. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2548d7d56408c..b3db97440db15 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3274,14 +3274,13 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) { int err; - if (src->ops->clone) { - dst->ops = src->ops; - err = src->ops->clone(dst, src); - if (err < 0) - return err; - } else { - memcpy(dst, src, src->ops->size); - } + if (WARN_ON_ONCE(!src->ops->clone)) + return -EINVAL; + + dst->ops = src->ops; + err = src->ops->clone(dst, src); + if (err < 0) + return err; __module_get(src->ops->type->owner); -- GitLab From 91a139cee1202a4599a380810d93c69b5bac6197 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 10 Jan 2024 00:42:37 +0100 Subject: [PATCH 1040/1290] netfilter: nft_limit: do not ignore unsupported flags Bail out if userspace provides unsupported flags, otherwise future extensions to the limit expression will be silently ignored by the kernel. Fixes: c7862a5f0de5 ("netfilter: nft_limit: allow to invert matching criteria") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_limit.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 145dc62c62472..79039afde34ec 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -58,6 +58,7 @@ static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost) static int nft_limit_init(struct nft_limit_priv *priv, const struct nlattr * const tb[], bool pkts) { + bool invert = false; u64 unit, tokens; if (tb[NFTA_LIMIT_RATE] == NULL || @@ -90,19 +91,23 @@ static int nft_limit_init(struct nft_limit_priv *priv, priv->rate); } + if (tb[NFTA_LIMIT_FLAGS]) { + u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); + + if (flags & ~NFT_LIMIT_F_INV) + return -EOPNOTSUPP; + + if (flags & NFT_LIMIT_F_INV) + invert = true; + } + priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT); if (!priv->limit) return -ENOMEM; priv->limit->tokens = tokens; priv->tokens_max = priv->limit->tokens; - - if (tb[NFTA_LIMIT_FLAGS]) { - u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); - - if (flags & NFT_LIMIT_F_INV) - priv->invert = true; - } + priv->invert = invert; priv->limit->last = ktime_get_ns(); spin_lock_init(&priv->limit->lock); -- GitLab From c3f9fd54cd87233f53bdf0e191a86b3a5e960e02 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 11 Jan 2024 23:06:37 +0800 Subject: [PATCH 1041/1290] netfilter: nfnetlink_log: use proper helper for fetching physinif We don't use physindev in __build_packet_message except for getting physinif from it. So let's switch to nf_bridge_get_physinif to get what we want directly. Signed-off-by: Pavel Tikhomirov Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index f03f4d4d7d889..134e05d31061e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -508,7 +508,7 @@ __build_packet_message(struct nfnl_log_net *log, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { - struct net_device *physindev; + int physinif; /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ @@ -516,10 +516,10 @@ __build_packet_message(struct nfnl_log_net *log, htonl(indev->ifindex))) goto nla_put_failure; - physindev = nf_bridge_get_physindev(skb); - if (physindev && + physinif = nf_bridge_get_physinif(skb); + if (physinif && nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, - htonl(physindev->ifindex))) + htonl(physinif))) goto nla_put_failure; } #endif -- GitLab From aeaa44075f8e49e2e0ad4507d925e690b7950145 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 11 Jan 2024 23:06:38 +0800 Subject: [PATCH 1042/1290] netfilter: nf_queue: remove excess nf_bridge variable We don't really need nf_bridge variable here. And nf_bridge_info_exists is better replacement for nf_bridge_info_get in case we are only checking for existence. Signed-off-by: Pavel Tikhomirov Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_queue.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 63d1516816b1f..3dfcb3ac5cb44 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -82,10 +82,8 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const struct sk_buff *skb = entry->skb; - struct nf_bridge_info *nf_bridge; - nf_bridge = nf_bridge_info_get(skb); - if (nf_bridge) { + if (nf_bridge_info_exists(skb)) { entry->physin = nf_bridge_get_physindev(skb); entry->physout = nf_bridge_get_physoutdev(skb); } else { -- GitLab From a54e72197037d2c9bfcd70dddaac8c8ccb5b41ba Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 11 Jan 2024 23:06:39 +0800 Subject: [PATCH 1043/1290] netfilter: propagate net to nf_bridge_get_physindev This is a preparation patch for replacing physindev with physinif on nf_bridge_info structure. We will use dev_get_by_index_rcu to resolve device, when needed, and it requires net to be available. Signed-off-by: Pavel Tikhomirov Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 2 +- net/ipv4/netfilter/nf_reject_ipv4.c | 2 +- net/ipv6/netfilter/nf_reject_ipv6.c | 2 +- net/netfilter/ipset/ip_set_hash_netiface.c | 8 ++++---- net/netfilter/nf_log_syslog.c | 13 +++++++------ net/netfilter/nf_queue.c | 2 +- net/netfilter/xt_physdev.c | 2 +- 7 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index f980edfdd2783..e927b9a15a556 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -56,7 +56,7 @@ static inline int nf_bridge_get_physoutif(const struct sk_buff *skb) } static inline struct net_device * -nf_bridge_get_physindev(const struct sk_buff *skb) +nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index f01b038fc1cda..86e7d390671af 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -289,7 +289,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - br_indev = nf_bridge_get_physindev(oldskb); + br_indev = nf_bridge_get_physindev(oldskb, net); if (br_indev) { struct ethhdr *oeth = eth_hdr(oldskb); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index d45bc54b7ea55..27b2164f4c439 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -354,7 +354,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - br_indev = nf_bridge_get_physindev(oldskb); + br_indev = nf_bridge_get_physindev(oldskb, net); if (br_indev) { struct ethhdr *oeth = eth_hdr(oldskb); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 95aeb31c60e0d..30a655e5c4fdc 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -138,9 +138,9 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next, #include "ip_set_hash_gen.h" #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -static const char *get_physindev_name(const struct sk_buff *skb) +static const char *get_physindev_name(const struct sk_buff *skb, struct net *net) { - struct net_device *dev = nf_bridge_get_physindev(skb); + struct net_device *dev = nf_bridge_get_physindev(skb, net); return dev ? dev->name : NULL; } @@ -177,7 +177,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - const char *eiface = SRCDIR ? get_physindev_name(skb) : + const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) : get_physoutdev_name(skb); if (!eiface) @@ -395,7 +395,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - const char *eiface = SRCDIR ? get_physindev_name(skb) : + const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) : get_physoutdev_name(skb); if (!eiface) diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index c66689ad2b491..58402226045e8 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -111,7 +111,8 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const struct nf_loginfo *loginfo, const char *prefix) + const struct nf_loginfo *loginfo, const char *prefix, + struct net *net) { const struct net_device *physoutdev __maybe_unused; const struct net_device *physindev __maybe_unused; @@ -121,7 +122,7 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf, in ? in->name : "", out ? out->name : ""); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - physindev = nf_bridge_get_physindev(skb); + physindev = nf_bridge_get_physindev(skb, net); if (physindev && in != physindev) nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); physoutdev = nf_bridge_get_physoutdev(skb); @@ -148,7 +149,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf, loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, - prefix); + prefix, net); dump_arp_packet(m, loginfo, skb, skb_network_offset(skb)); nf_log_buf_close(m); @@ -845,7 +846,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf, loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, - out, loginfo, prefix); + out, loginfo, prefix, net); if (in) dump_mac_header(m, loginfo, skb); @@ -880,7 +881,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf, loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, - loginfo, prefix); + loginfo, prefix, net); if (in) dump_mac_header(m, loginfo, skb); @@ -916,7 +917,7 @@ static void nf_log_unknown_packet(struct net *net, u_int8_t pf, loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, - prefix); + prefix, net); dump_mac_header(m, loginfo, skb); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 3dfcb3ac5cb44..e2f334f70281f 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -84,7 +84,7 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) const struct sk_buff *skb = entry->skb; if (nf_bridge_info_exists(skb)) { - entry->physin = nf_bridge_get_physindev(skb); + entry->physin = nf_bridge_get_physindev(skb, entry->state.net); entry->physout = nf_bridge_get_physoutdev(skb); } else { entry->physin = NULL; diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index ec6ed6fda96c5..343e65f377d44 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -59,7 +59,7 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par) (!!outdev ^ !(info->invert & XT_PHYSDEV_OP_BRIDGED))) return false; - physdev = nf_bridge_get_physindev(skb); + physdev = nf_bridge_get_physindev(skb, xt_net(par)); indev = physdev ? physdev->name : NULL; if ((info->bitmask & XT_PHYSDEV_OP_ISIN && -- GitLab From 9874808878d9eed407e3977fd11fee49de1e1d86 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 11 Jan 2024 23:06:40 +0800 Subject: [PATCH 1044/1290] netfilter: bridge: replace physindev with physinif in nf_bridge_info An skb can be added to a neigh->arp_queue while waiting for an arp reply. Where original skb's skb->dev can be different to neigh's neigh->dev. For instance in case of bridging dnated skb from one veth to another, the skb would be added to a neigh->arp_queue of the bridge. As skb->dev can be reset back to nf_bridge->physindev and used, and as there is no explicit mechanism that prevents this physindev from been freed under us (for instance neigh_flush_dev doesn't cleanup skbs from different device's neigh queue) we can crash on e.g. this stack: arp_process neigh_update skb = __skb_dequeue(&neigh->arp_queue) neigh_resolve_output(..., skb) ... br_nf_dev_xmit br_nf_pre_routing_finish_bridge_slow skb->dev = nf_bridge->physindev br_handle_frame_finish Let's use plain ifindex instead of net_device link. To peek into the original net_device we will use dev_get_by_index_rcu(). Thus either we get device and are safe to use it or we don't get it and drop skb. Fixes: c4e70a87d975 ("netfilter: bridge: rename br_netfilter.c to br_netfilter_hooks.c") Suggested-by: Florian Westphal Signed-off-by: Pavel Tikhomirov Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 4 +-- include/linux/skbuff.h | 2 +- net/bridge/br_netfilter_hooks.c | 42 +++++++++++++++++++++++------ net/bridge/br_netfilter_ipv6.c | 14 +++++++--- net/ipv4/netfilter/nf_reject_ipv4.c | 9 ++++--- net/ipv6/netfilter/nf_reject_ipv6.c | 11 +++++--- 6 files changed, 61 insertions(+), 21 deletions(-) diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index e927b9a15a556..743475ca7e9d5 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -42,7 +42,7 @@ static inline int nf_bridge_get_physinif(const struct sk_buff *skb) if (!nf_bridge) return 0; - return nf_bridge->physindev ? nf_bridge->physindev->ifindex : 0; + return nf_bridge->physinif; } static inline int nf_bridge_get_physoutif(const struct sk_buff *skb) @@ -60,7 +60,7 @@ nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - return nf_bridge ? nf_bridge->physindev : NULL; + return nf_bridge ? dev_get_by_index_rcu(net, nf_bridge->physinif) : NULL; } static inline struct net_device * diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a5ae952454c89..2dde34c29203b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -295,7 +295,7 @@ struct nf_bridge_info { u8 bridged_dnat:1; u8 sabotage_in_done:1; __u16 frag_max_size; - struct net_device *physindev; + int physinif; /* always valid & non-NULL from FORWARD on, for physdev match */ struct net_device *physoutdev; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 6adcb45bca75d..ed17208907578 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -279,8 +279,17 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_ if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) && READ_ONCE(neigh->hh.hh_len)) { + struct net_device *br_indev; + + br_indev = nf_bridge_get_physindev(skb, net); + if (!br_indev) { + neigh_release(neigh); + goto free_skb; + } + neigh_hh_bridge(&neigh->hh, skb); - skb->dev = nf_bridge->physindev; + skb->dev = br_indev; + ret = br_handle_frame_finish(net, sk, skb); } else { /* the neighbour function below overwrites the complete @@ -352,12 +361,18 @@ br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb, */ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb->dev; + struct net_device *dev = skb->dev, *br_indev; struct iphdr *iph = ip_hdr(skb); struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct rtable *rt; int err; + br_indev = nf_bridge_get_physindev(skb, net); + if (!br_indev) { + kfree_skb(skb); + return 0; + } + nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; if (nf_bridge->pkt_otherhost) { @@ -397,7 +412,7 @@ free_skb: } else { if (skb_dst(skb)->dev == dev) { bridged_dnat: - skb->dev = nf_bridge->physindev; + skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, @@ -410,7 +425,7 @@ bridged_dnat: skb->pkt_type = PACKET_HOST; } } else { - rt = bridge_parent_rtable(nf_bridge->physindev); + rt = bridge_parent_rtable(br_indev); if (!rt) { kfree_skb(skb); return 0; @@ -419,7 +434,7 @@ bridged_dnat: skb_dst_set_noref(skb, &rt->dst); } - skb->dev = nf_bridge->physindev; + skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, @@ -456,7 +471,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net) } nf_bridge->in_prerouting = 1; - nf_bridge->physindev = skb->dev; + nf_bridge->physinif = skb->dev->ifindex; skb->dev = brnf_get_logical_dev(skb, skb->dev, net); if (skb->protocol == htons(ETH_P_8021Q)) @@ -553,7 +568,11 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff if (skb->protocol == htons(ETH_P_IPV6)) nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; - in = nf_bridge->physindev; + in = nf_bridge_get_physindev(skb, net); + if (!in) { + kfree_skb(skb); + return 0; + } if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; @@ -899,6 +918,13 @@ static unsigned int ip_sabotage_in(void *priv, static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct net_device *br_indev; + + br_indev = nf_bridge_get_physindev(skb, dev_net(skb->dev)); + if (!br_indev) { + kfree_skb(skb); + return; + } skb_pull(skb, ETH_HLEN); nf_bridge->bridged_dnat = 0; @@ -908,7 +934,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN), nf_bridge->neigh_header, ETH_HLEN - ETH_ALEN); - skb->dev = nf_bridge->physindev; + skb->dev = br_indev; nf_bridge->physoutdev = NULL; br_handle_frame_finish(dev_net(skb->dev), NULL, skb); diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 2e24a743f9173..e0421eaa3abc7 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -102,9 +102,15 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct rtable *rt; - struct net_device *dev = skb->dev; + struct net_device *dev = skb->dev, *br_indev; const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + br_indev = nf_bridge_get_physindev(skb, net); + if (!br_indev) { + kfree_skb(skb); + return 0; + } + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; if (nf_bridge->pkt_otherhost) { @@ -122,7 +128,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc } if (skb_dst(skb)->dev == dev) { - skb->dev = nf_bridge->physindev; + skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, @@ -133,7 +139,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); skb->pkt_type = PACKET_HOST; } else { - rt = bridge_parent_rtable(nf_bridge->physindev); + rt = bridge_parent_rtable(br_indev); if (!rt) { kfree_skb(skb); return 0; @@ -142,7 +148,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc skb_dst_set_noref(skb, &rt->dst); } - skb->dev = nf_bridge->physindev; + skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 86e7d390671af..04504b2b51df5 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -239,7 +239,6 @@ static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook) { - struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; struct iphdr *niph; const struct tcphdr *oth; @@ -289,9 +288,13 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - br_indev = nf_bridge_get_physindev(oldskb, net); - if (br_indev) { + if (nf_bridge_info_exists(oldskb)) { struct ethhdr *oeth = eth_hdr(oldskb); + struct net_device *br_indev; + + br_indev = nf_bridge_get_physindev(oldskb, net); + if (!br_indev) + goto free_nskb; nskb->dev = br_indev; niph->tot_len = htons(nskb->len); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 27b2164f4c439..196dd4ecb5e21 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -278,7 +278,6 @@ static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook) { - struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; struct tcphdr _otcph; const struct tcphdr *otcph; @@ -354,9 +353,15 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - br_indev = nf_bridge_get_physindev(oldskb, net); - if (br_indev) { + if (nf_bridge_info_exists(oldskb)) { struct ethhdr *oeth = eth_hdr(oldskb); + struct net_device *br_indev; + + br_indev = nf_bridge_get_physindev(oldskb, net); + if (!br_indev) { + kfree_skb(nskb); + return; + } nskb->dev = br_indev; nskb->protocol = htons(ETH_P_IPV6); -- GitLab From b1db244ffd041a49ecc9618e8feb6b5c1afcdaa7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 12 Jan 2024 23:28:45 +0100 Subject: [PATCH 1045/1290] netfilter: nf_tables: check if catch-all set element is active in next generation When deactivating the catch-all set element, check the state in the next generation that represents this transaction. This bug uncovered after the recent removal of the element busy mark a2dd0233cbc4 ("netfilter: nf_tables: remove busy mark and gc batch API"). Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support") Cc: stable@vger.kernel.org Reported-by: lonial con Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b3db97440db15..50b595ef63892 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6578,7 +6578,7 @@ static int nft_setelem_catchall_deactivate(const struct net *net, list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_is_active(net, ext)) + if (!nft_is_active_next(net, ext)) continue; kfree(elem->priv); -- GitLab From 3ce67e3793f48c1b9635beb9bb71116ca1e51b58 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 14 Jan 2024 23:53:39 +0100 Subject: [PATCH 1046/1290] netfilter: nf_tables: do not allow mismatch field size and set key length The set description provides the size of each field in the set whose sum should not mismatch the set key length, bail out otherwise. I did not manage to crash nft_set_pipapo with mismatch fields and set key length so far, but this is UB which must be disallowed. Fixes: f3a2181e16f1 ("netfilter: nf_tables: Support for sets with multiple ranged fields") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 50b595ef63892..e9fa4a32c0939 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4813,8 +4813,8 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr, static int nft_set_desc_concat(struct nft_set_desc *desc, const struct nlattr *nla) { + u32 num_regs = 0, key_num_regs = 0; struct nlattr *attr; - u32 num_regs = 0; int rem, err, i; nla_for_each_nested(attr, nla, rem) { @@ -4829,6 +4829,10 @@ static int nft_set_desc_concat(struct nft_set_desc *desc, for (i = 0; i < desc->field_count; i++) num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32)); + key_num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); + if (key_num_regs != num_regs) + return -EINVAL; + if (num_regs > NFT_REG32_COUNT) return -E2BIG; -- GitLab From 6b1ca88e4bb63673dc9f9c7f23c899f22c3cb17a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 15 Jan 2024 00:14:38 +0100 Subject: [PATCH 1047/1290] netfilter: nf_tables: skip dead set elements in netlink dump Delete from packet path relies on the garbage collector to purge elements with NFT_SET_ELEM_DEAD_BIT on. Skip these dead elements from nf_tables_dump_setelem() path, I very rarely see tests/shell/testcases/maps/typeof_maps_add_delete reports [DUMP FAILED] showing a mismatch in the expected output with an element that should not be there. If the netlink dump happens before GC worker run, it might show dead elements in the ruleset listing. nft_rhash_get() already skips dead elements in nft_rhash_cmp(), therefore, it already does not show the element when getting a single element via netlink control plane. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e9fa4a32c0939..88a6a858383b6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5718,7 +5718,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; - if (nft_set_elem_expired(ext)) + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; args = container_of(iter, struct nft_set_dump_args, iter); -- GitLab From 113661e07460a6604aacc8ae1b23695a89e7d4b3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 15 Jan 2024 12:50:29 +0100 Subject: [PATCH 1048/1290] netfilter: nf_tables: reject NFT_SET_CONCAT with not field length description It is still possible to set on the NFT_SET_CONCAT flag by specifying a set size and no field description, report EINVAL in such case. Fixes: 1b6345d4160e ("netfilter: nf_tables: check NFT_SET_CONCAT flag if field_count is specified") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 88a6a858383b6..4b55533ce5ca2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5070,8 +5070,12 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (err < 0) return err; - if (desc.field_count > 1 && !(flags & NFT_SET_CONCAT)) + if (desc.field_count > 1) { + if (!(flags & NFT_SET_CONCAT)) + return -EINVAL; + } else if (flags & NFT_SET_CONCAT) { return -EINVAL; + } } else if (flags & NFT_SET_CONCAT) { return -EINVAL; } -- GitLab From d6938c1c76c64f42363d0d1f051e1b4641c2ad40 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Mon, 15 Jan 2024 17:39:22 +0300 Subject: [PATCH 1049/1290] ipvs: avoid stat macros calls from preemptible context Inside decrement_ttl() upon discovering that the packet ttl has exceeded, __IP_INC_STATS and __IP6_INC_STATS macros can be called from preemptible context having the following backtrace: check_preemption_disabled: 48 callbacks suppressed BUG: using __this_cpu_add() in preemptible [00000000] code: curl/1177 caller is decrement_ttl+0x217/0x830 CPU: 5 PID: 1177 Comm: curl Not tainted 6.7.0+ #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 04/01/2014 Call Trace: dump_stack_lvl+0xbd/0xe0 check_preemption_disabled+0xd1/0xe0 decrement_ttl+0x217/0x830 __ip_vs_get_out_rt+0x4e0/0x1ef0 ip_vs_nat_xmit+0x205/0xcd0 ip_vs_in_hook+0x9b1/0x26a0 nf_hook_slow+0xc2/0x210 nf_hook+0x1fb/0x770 __ip_local_out+0x33b/0x640 ip_local_out+0x2a/0x490 __ip_queue_xmit+0x990/0x1d10 __tcp_transmit_skb+0x288b/0x3d10 tcp_connect+0x3466/0x5180 tcp_v4_connect+0x1535/0x1bb0 __inet_stream_connect+0x40d/0x1040 inet_stream_connect+0x57/0xa0 __sys_connect_file+0x162/0x1a0 __sys_connect+0x137/0x160 __x64_sys_connect+0x72/0xb0 do_syscall_64+0x6f/0x140 entry_SYSCALL_64_after_hwframe+0x6e/0x76 RIP: 0033:0x7fe6dbbc34e0 Use the corresponding preemption-aware variants: IP_INC_STATS and IP6_INC_STATS. Found by Linux Verification Center (linuxtesting.org). Fixes: 8d8e20e2d7bb ("ipvs: Decrement ttl") Signed-off-by: Fedor Pchelkin Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_xmit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 9193e109e6b38..65e0259178da4 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -271,7 +271,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return false; } @@ -286,7 +286,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, { if (ip_hdr(skb)->ttl <= 1) { /* Tell the sender its packet died... */ - __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); return false; } -- GitLab From 8f54fca3f8fce49c2d5f4ec4b7c325d1e50916df Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 4 Jan 2024 16:01:30 +0100 Subject: [PATCH 1050/1290] s390/net: add Thorsten Winkler as maintainer Thank you Wenjia for your support, welcome Thorsten! Acked-by: Wenjia Zhang Acked-by: Thorsten Winkler Signed-off-by: Alexandra Winter Signed-off-by: Alexander Gordeev --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index cc92b10a4cada..20396600f86a2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18962,7 +18962,7 @@ F: drivers/iommu/s390-iommu.c S390 IUCV NETWORK LAYER M: Alexandra Winter -M: Wenjia Zhang +M: Thorsten Winkler L: linux-s390@vger.kernel.org L: netdev@vger.kernel.org S: Supported @@ -18981,7 +18981,7 @@ F: arch/s390/mm S390 NETWORK DRIVERS M: Alexandra Winter -M: Wenjia Zhang +M: Thorsten Winkler L: linux-s390@vger.kernel.org L: netdev@vger.kernel.org S: Supported -- GitLab From 850fb7fa8c684a4c6bf0e4b6978f4ddcc5d43d11 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Mon, 15 Jan 2024 13:54:31 -0500 Subject: [PATCH 1051/1290] s390/vfio-ap: always filter entire AP matrix The vfio_ap_mdev_filter_matrix function is called whenever a new adapter or domain is assigned to the mdev. The purpose of the function is to update the guest's AP configuration by filtering the matrix of adapters and domains assigned to the mdev. When an adapter or domain is assigned, only the APQNs associated with the APID of the new adapter or APQI of the new domain are inspected. If an APQN does not reference a queue device bound to the vfio_ap device driver, then it's APID will be filtered from the mdev's matrix when updating the guest's AP configuration. Inspecting only the APID of the new adapter or APQI of the new domain will result in passing AP queues through to a guest that are not bound to the vfio_ap device driver under certain circumstances. Consider the following: guest's AP configuration (all also assigned to the mdev's matrix): 14.0004 14.0005 14.0006 16.0004 16.0005 16.0006 unassign domain 4 unbind queue 16.0005 assign domain 4 When domain 4 is re-assigned, since only domain 4 will be inspected, the APQNs that will be examined will be: 14.0004 16.0004 Since both of those APQNs reference queue devices that are bound to the vfio_ap device driver, nothing will get filtered from the mdev's matrix when updating the guest's AP configuration. Consequently, queue 16.0005 will get passed through despite not being bound to the driver. This violates the linux device model requirement that a guest shall only be given access to devices bound to the device driver facilitating their pass-through. To resolve this problem, every adapter and domain assigned to the mdev will be inspected when filtering the mdev's matrix. Signed-off-by: Tony Krowiak Acked-by: Halil Pasic Fixes: 48cae940c31d ("s390/vfio-ap: refresh guest's APCB by filtering AP resources assigned to mdev") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240115185441.31526-2-akrowiak@linux.ibm.com Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/vfio_ap_ops.c | 57 +++++++++---------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index acb710d3d7bcd..1f7a6c1067868 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -674,8 +674,7 @@ static bool vfio_ap_mdev_filter_cdoms(struct ap_matrix_mdev *matrix_mdev) * Return: a boolean value indicating whether the KVM guest's APCB was changed * by the filtering or not. */ -static bool vfio_ap_mdev_filter_matrix(unsigned long *apm, unsigned long *aqm, - struct ap_matrix_mdev *matrix_mdev) +static bool vfio_ap_mdev_filter_matrix(struct ap_matrix_mdev *matrix_mdev) { unsigned long apid, apqi, apqn; DECLARE_BITMAP(prev_shadow_apm, AP_DEVICES); @@ -696,8 +695,8 @@ static bool vfio_ap_mdev_filter_matrix(unsigned long *apm, unsigned long *aqm, bitmap_and(matrix_mdev->shadow_apcb.aqm, matrix_mdev->matrix.aqm, (unsigned long *)matrix_dev->info.aqm, AP_DOMAINS); - for_each_set_bit_inv(apid, apm, AP_DEVICES) { - for_each_set_bit_inv(apqi, aqm, AP_DOMAINS) { + for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, AP_DEVICES) { + for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm, AP_DOMAINS) { /* * If the APQN is not bound to the vfio_ap device * driver, then we can't assign it to the guest's @@ -962,7 +961,6 @@ static ssize_t assign_adapter_store(struct device *dev, { int ret; unsigned long apid; - DECLARE_BITMAP(apm_delta, AP_DEVICES); struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); mutex_lock(&ap_perms_mutex); @@ -991,11 +989,8 @@ static ssize_t assign_adapter_store(struct device *dev, } vfio_ap_mdev_link_adapter(matrix_mdev, apid); - memset(apm_delta, 0, sizeof(apm_delta)); - set_bit_inv(apid, apm_delta); - if (vfio_ap_mdev_filter_matrix(apm_delta, - matrix_mdev->matrix.aqm, matrix_mdev)) + if (vfio_ap_mdev_filter_matrix(matrix_mdev)) vfio_ap_mdev_update_guest_apcb(matrix_mdev); ret = count; @@ -1171,7 +1166,6 @@ static ssize_t assign_domain_store(struct device *dev, { int ret; unsigned long apqi; - DECLARE_BITMAP(aqm_delta, AP_DOMAINS); struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); mutex_lock(&ap_perms_mutex); @@ -1200,11 +1194,8 @@ static ssize_t assign_domain_store(struct device *dev, } vfio_ap_mdev_link_domain(matrix_mdev, apqi); - memset(aqm_delta, 0, sizeof(aqm_delta)); - set_bit_inv(apqi, aqm_delta); - if (vfio_ap_mdev_filter_matrix(matrix_mdev->matrix.apm, aqm_delta, - matrix_mdev)) + if (vfio_ap_mdev_filter_matrix(matrix_mdev)) vfio_ap_mdev_update_guest_apcb(matrix_mdev); ret = count; @@ -2109,9 +2100,7 @@ int vfio_ap_mdev_probe_queue(struct ap_device *apdev) if (matrix_mdev) { vfio_ap_mdev_link_queue(matrix_mdev, q); - if (vfio_ap_mdev_filter_matrix(matrix_mdev->matrix.apm, - matrix_mdev->matrix.aqm, - matrix_mdev)) + if (vfio_ap_mdev_filter_matrix(matrix_mdev)) vfio_ap_mdev_update_guest_apcb(matrix_mdev); } dev_set_drvdata(&apdev->device, q); @@ -2461,34 +2450,22 @@ void vfio_ap_on_cfg_changed(struct ap_config_info *cur_cfg_info, static void vfio_ap_mdev_hot_plug_cfg(struct ap_matrix_mdev *matrix_mdev) { - bool do_hotplug = false; - int filter_domains = 0; - int filter_adapters = 0; - DECLARE_BITMAP(apm, AP_DEVICES); - DECLARE_BITMAP(aqm, AP_DOMAINS); + bool filter_domains, filter_adapters, filter_cdoms, do_hotplug = false; mutex_lock(&matrix_mdev->kvm->lock); mutex_lock(&matrix_dev->mdevs_lock); - filter_adapters = bitmap_and(apm, matrix_mdev->matrix.apm, - matrix_mdev->apm_add, AP_DEVICES); - filter_domains = bitmap_and(aqm, matrix_mdev->matrix.aqm, - matrix_mdev->aqm_add, AP_DOMAINS); - - if (filter_adapters && filter_domains) - do_hotplug |= vfio_ap_mdev_filter_matrix(apm, aqm, matrix_mdev); - else if (filter_adapters) - do_hotplug |= - vfio_ap_mdev_filter_matrix(apm, - matrix_mdev->shadow_apcb.aqm, - matrix_mdev); - else - do_hotplug |= - vfio_ap_mdev_filter_matrix(matrix_mdev->shadow_apcb.apm, - aqm, matrix_mdev); + filter_adapters = bitmap_intersects(matrix_mdev->matrix.apm, + matrix_mdev->apm_add, AP_DEVICES); + filter_domains = bitmap_intersects(matrix_mdev->matrix.aqm, + matrix_mdev->aqm_add, AP_DOMAINS); + filter_cdoms = bitmap_intersects(matrix_mdev->matrix.adm, + matrix_mdev->adm_add, AP_DOMAINS); + + if (filter_adapters || filter_domains) + do_hotplug = vfio_ap_mdev_filter_matrix(matrix_mdev); - if (bitmap_intersects(matrix_mdev->matrix.adm, matrix_mdev->adm_add, - AP_DOMAINS)) + if (filter_cdoms) do_hotplug |= vfio_ap_mdev_filter_cdoms(matrix_mdev); if (do_hotplug) -- GitLab From 16fb78cbf56e42b8efb2682a4444ab59e32e7959 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Mon, 15 Jan 2024 13:54:32 -0500 Subject: [PATCH 1052/1290] s390/vfio-ap: loop over the shadow APCB when filtering guest's AP configuration While filtering the mdev matrix, it doesn't make sense - and will have unexpected results - to filter an APID from the matrix if the APID or one of the associated APQIs is not in the host's AP configuration. There are two reasons for this: 1. An adapter or domain that is not in the host's AP configuration can be assigned to the matrix; this is known as over-provisioning. Queue devices, however, are only created for adapters and domains in the host's AP configuration, so there will be no queues associated with an over-provisioned adapter or domain to filter. 2. The adapter or domain may have been externally removed from the host's configuration via an SE or HMC attached to a DPM enabled LPAR. In this case, the vfio_ap device driver would have been notified by the AP bus via the on_config_changed callback and the adapter or domain would have already been filtered. Since the matrix_mdev->shadow_apcb.apm and matrix_mdev->shadow_apcb.aqm are copied from the mdev matrix sans the APIDs and APQIs not in the host's AP configuration, let's loop over those bitmaps instead of those assigned to the matrix. Signed-off-by: Tony Krowiak Reviewed-by: Halil Pasic Fixes: 48cae940c31d ("s390/vfio-ap: refresh guest's APCB by filtering AP resources assigned to mdev") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240115185441.31526-3-akrowiak@linux.ibm.com Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/vfio_ap_ops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 1f7a6c1067868..e825e13847fe0 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -695,8 +695,9 @@ static bool vfio_ap_mdev_filter_matrix(struct ap_matrix_mdev *matrix_mdev) bitmap_and(matrix_mdev->shadow_apcb.aqm, matrix_mdev->matrix.aqm, (unsigned long *)matrix_dev->info.aqm, AP_DOMAINS); - for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, AP_DEVICES) { - for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm, AP_DOMAINS) { + for_each_set_bit_inv(apid, matrix_mdev->shadow_apcb.apm, AP_DEVICES) { + for_each_set_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm, + AP_DOMAINS) { /* * If the APQN is not bound to the vfio_ap device * driver, then we can't assign it to the guest's -- GitLab From 774d10196e648e2c0b78da817f631edfb3dfa557 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Mon, 15 Jan 2024 13:54:33 -0500 Subject: [PATCH 1053/1290] s390/vfio-ap: let on_scan_complete() callback filter matrix and update guest's APCB When adapters and/or domains are added to the host's AP configuration, this may result in multiple queue devices getting created and probed by the vfio_ap device driver. For each queue device probed, the matrix of adapters and domains assigned to a matrix mdev will be filtered to update the guest's APCB. If any adapters or domains get added to or removed from the APCB, the guest's AP configuration will be dynamically updated (i.e., hot plug/unplug). To dynamically update the guest's configuration, its VCPUs must be taken out of SIE for the period of time it takes to make the update. This is disruptive to the guest's operation and if there are many queues probed due to a change in the host's AP configuration, this could be troublesome. The problem is exacerbated by the fact that the 'on_scan_complete' callback also filters the mdev's matrix and updates the guest's AP configuration. In order to reduce the potential amount of disruption to the guest that may result from a change to the host's AP configuration, let's bypass the filtering of the matrix and updating of the guest's AP configuration in the probe callback - if due to a host config change - and defer it until the 'on_scan_complete' callback is invoked after the AP bus finishes its device scan operation. This way the filtering and updating will be performed only once regardless of the number of queues added. Signed-off-by: Tony Krowiak Reviewed-by: Halil Pasic Fixes: 48cae940c31d ("s390/vfio-ap: refresh guest's APCB by filtering AP resources assigned to mdev") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240115185441.31526-4-akrowiak@linux.ibm.com Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/vfio_ap_ops.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index e825e13847fe0..e45d0bf7b126e 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -2101,9 +2101,22 @@ int vfio_ap_mdev_probe_queue(struct ap_device *apdev) if (matrix_mdev) { vfio_ap_mdev_link_queue(matrix_mdev, q); + /* + * If we're in the process of handling the adding of adapters or + * domains to the host's AP configuration, then let the + * vfio_ap device driver's on_scan_complete callback filter the + * matrix and update the guest's AP configuration after all of + * the new queue devices are probed. + */ + if (!bitmap_empty(matrix_mdev->apm_add, AP_DEVICES) || + !bitmap_empty(matrix_mdev->aqm_add, AP_DOMAINS)) + goto done; + if (vfio_ap_mdev_filter_matrix(matrix_mdev)) vfio_ap_mdev_update_guest_apcb(matrix_mdev); } + +done: dev_set_drvdata(&apdev->device, q); release_update_locks_for_mdev(matrix_mdev); -- GitLab From f848cba767e59f8d5c54984b1d45451aae040d50 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Mon, 15 Jan 2024 13:54:34 -0500 Subject: [PATCH 1054/1290] s390/vfio-ap: reset queues filtered from the guest's AP config When filtering the adapters from the configuration profile for a guest to create or update a guest's AP configuration, if the APID of an adapter and the APQI of a domain identify a queue device that is not bound to the vfio_ap device driver, the APID of the adapter will be filtered because an individual APQN can not be filtered due to the fact the APQNs are assigned to an AP configuration as a matrix of APIDs and APQIs. Consequently, a guest will not have access to all of the queues associated with the filtered adapter. If the queues are subsequently made available again to the guest, they should re-appear in a reset state; so, let's make sure all queues associated with an adapter unplugged from the guest are reset. In order to identify the set of queues that need to be reset, let's allow a vfio_ap_queue object to be simultaneously stored in both a hashtable and a list: A hashtable used to store all of the queues assigned to a matrix mdev; and/or, a list used to store a subset of the queues that need to be reset. For example, when an adapter is hot unplugged from a guest, all guest queues associated with that adapter must be reset. Since that may be a subset of those assigned to the matrix mdev, they can be stored in a list that can be passed to the vfio_ap_mdev_reset_queues function. Signed-off-by: Tony Krowiak Acked-by: Halil Pasic Fixes: 48cae940c31d ("s390/vfio-ap: refresh guest's APCB by filtering AP resources assigned to mdev") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240115185441.31526-5-akrowiak@linux.ibm.com Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/vfio_ap_ops.c | 171 +++++++++++++++++++------- drivers/s390/crypto/vfio_ap_private.h | 3 + 2 files changed, 129 insertions(+), 45 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index e45d0bf7b126e..44cd29aace8ec 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -32,7 +32,8 @@ #define AP_RESET_INTERVAL 20 /* Reset sleep interval (20ms) */ -static int vfio_ap_mdev_reset_queues(struct ap_queue_table *qtable); +static int vfio_ap_mdev_reset_queues(struct ap_matrix_mdev *matrix_mdev); +static int vfio_ap_mdev_reset_qlist(struct list_head *qlist); static struct vfio_ap_queue *vfio_ap_find_queue(int apqn); static const struct vfio_device_ops vfio_ap_matrix_dev_ops; static void vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q); @@ -665,16 +666,23 @@ static bool vfio_ap_mdev_filter_cdoms(struct ap_matrix_mdev *matrix_mdev) * device driver. * * @matrix_mdev: the matrix mdev whose matrix is to be filtered. + * @apm_filtered: a 256-bit bitmap for storing the APIDs filtered from the + * guest's AP configuration that are still in the host's AP + * configuration. * * Note: If an APQN referencing a queue device that is not bound to the vfio_ap * driver, its APID will be filtered from the guest's APCB. The matrix * structure precludes filtering an individual APQN, so its APID will be - * filtered. + * filtered. Consequently, all queues associated with the adapter that + * are in the host's AP configuration must be reset. If queues are + * subsequently made available again to the guest, they should re-appear + * in a reset state * * Return: a boolean value indicating whether the KVM guest's APCB was changed * by the filtering or not. */ -static bool vfio_ap_mdev_filter_matrix(struct ap_matrix_mdev *matrix_mdev) +static bool vfio_ap_mdev_filter_matrix(struct ap_matrix_mdev *matrix_mdev, + unsigned long *apm_filtered) { unsigned long apid, apqi, apqn; DECLARE_BITMAP(prev_shadow_apm, AP_DEVICES); @@ -684,6 +692,7 @@ static bool vfio_ap_mdev_filter_matrix(struct ap_matrix_mdev *matrix_mdev) bitmap_copy(prev_shadow_apm, matrix_mdev->shadow_apcb.apm, AP_DEVICES); bitmap_copy(prev_shadow_aqm, matrix_mdev->shadow_apcb.aqm, AP_DOMAINS); vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->shadow_apcb); + bitmap_clear(apm_filtered, 0, AP_DEVICES); /* * Copy the adapters, domains and control domains to the shadow_apcb @@ -709,8 +718,16 @@ static bool vfio_ap_mdev_filter_matrix(struct ap_matrix_mdev *matrix_mdev) apqn = AP_MKQID(apid, apqi); q = vfio_ap_mdev_get_queue(matrix_mdev, apqn); if (!q || q->reset_status.response_code) { - clear_bit_inv(apid, - matrix_mdev->shadow_apcb.apm); + clear_bit_inv(apid, matrix_mdev->shadow_apcb.apm); + + /* + * If the adapter was previously plugged into + * the guest, let's let the caller know that + * the APID was filtered. + */ + if (test_bit_inv(apid, prev_shadow_apm)) + set_bit_inv(apid, apm_filtered); + break; } } @@ -812,7 +829,7 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev) mutex_lock(&matrix_dev->guests_lock); mutex_lock(&matrix_dev->mdevs_lock); - vfio_ap_mdev_reset_queues(&matrix_mdev->qtable); + vfio_ap_mdev_reset_queues(matrix_mdev); vfio_ap_mdev_unlink_fr_queues(matrix_mdev); list_del(&matrix_mdev->node); mutex_unlock(&matrix_dev->mdevs_lock); @@ -922,6 +939,47 @@ static void vfio_ap_mdev_link_adapter(struct ap_matrix_mdev *matrix_mdev, AP_MKQID(apid, apqi)); } +static int reset_queues_for_apids(struct ap_matrix_mdev *matrix_mdev, + unsigned long *apm_reset) +{ + struct vfio_ap_queue *q, *tmpq; + struct list_head qlist; + unsigned long apid, apqi; + int apqn, ret = 0; + + if (bitmap_empty(apm_reset, AP_DEVICES)) + return 0; + + INIT_LIST_HEAD(&qlist); + + for_each_set_bit_inv(apid, apm_reset, AP_DEVICES) { + for_each_set_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm, + AP_DOMAINS) { + /* + * If the domain is not in the host's AP configuration, + * then resetting it will fail with response code 01 + * (APQN not valid). + */ + if (!test_bit_inv(apqi, + (unsigned long *)matrix_dev->info.aqm)) + continue; + + apqn = AP_MKQID(apid, apqi); + q = vfio_ap_mdev_get_queue(matrix_mdev, apqn); + + if (q) + list_add_tail(&q->reset_qnode, &qlist); + } + } + + ret = vfio_ap_mdev_reset_qlist(&qlist); + + list_for_each_entry_safe(q, tmpq, &qlist, reset_qnode) + list_del(&q->reset_qnode); + + return ret; +} + /** * assign_adapter_store - parses the APID from @buf and sets the * corresponding bit in the mediated matrix device's APM @@ -962,6 +1020,7 @@ static ssize_t assign_adapter_store(struct device *dev, { int ret; unsigned long apid; + DECLARE_BITMAP(apm_filtered, AP_DEVICES); struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); mutex_lock(&ap_perms_mutex); @@ -991,8 +1050,10 @@ static ssize_t assign_adapter_store(struct device *dev, vfio_ap_mdev_link_adapter(matrix_mdev, apid); - if (vfio_ap_mdev_filter_matrix(matrix_mdev)) + if (vfio_ap_mdev_filter_matrix(matrix_mdev, apm_filtered)) { vfio_ap_mdev_update_guest_apcb(matrix_mdev); + reset_queues_for_apids(matrix_mdev, apm_filtered); + } ret = count; done: @@ -1023,11 +1084,12 @@ static struct vfio_ap_queue * adapter was assigned. * @matrix_mdev: the matrix mediated device to which the adapter was assigned. * @apid: the APID of the unassigned adapter. - * @qtable: table for storing queues associated with unassigned adapter. + * @qlist: list for storing queues associated with unassigned adapter that + * need to be reset. */ static void vfio_ap_mdev_unlink_adapter(struct ap_matrix_mdev *matrix_mdev, unsigned long apid, - struct ap_queue_table *qtable) + struct list_head *qlist) { unsigned long apqi; struct vfio_ap_queue *q; @@ -1035,11 +1097,10 @@ static void vfio_ap_mdev_unlink_adapter(struct ap_matrix_mdev *matrix_mdev, for_each_set_bit_inv(apqi, matrix_mdev->matrix.aqm, AP_DOMAINS) { q = vfio_ap_unlink_apqn_fr_mdev(matrix_mdev, apid, apqi); - if (q && qtable) { + if (q && qlist) { if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm) && test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm)) - hash_add(qtable->queues, &q->mdev_qnode, - q->apqn); + list_add_tail(&q->reset_qnode, qlist); } } } @@ -1047,26 +1108,23 @@ static void vfio_ap_mdev_unlink_adapter(struct ap_matrix_mdev *matrix_mdev, static void vfio_ap_mdev_hot_unplug_adapter(struct ap_matrix_mdev *matrix_mdev, unsigned long apid) { - int loop_cursor; - struct vfio_ap_queue *q; - struct ap_queue_table *qtable = kzalloc(sizeof(*qtable), GFP_KERNEL); + struct vfio_ap_queue *q, *tmpq; + struct list_head qlist; - hash_init(qtable->queues); - vfio_ap_mdev_unlink_adapter(matrix_mdev, apid, qtable); + INIT_LIST_HEAD(&qlist); + vfio_ap_mdev_unlink_adapter(matrix_mdev, apid, &qlist); if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm)) { clear_bit_inv(apid, matrix_mdev->shadow_apcb.apm); vfio_ap_mdev_update_guest_apcb(matrix_mdev); } - vfio_ap_mdev_reset_queues(qtable); + vfio_ap_mdev_reset_qlist(&qlist); - hash_for_each(qtable->queues, loop_cursor, q, mdev_qnode) { + list_for_each_entry_safe(q, tmpq, &qlist, reset_qnode) { vfio_ap_unlink_mdev_fr_queue(q); - hash_del(&q->mdev_qnode); + list_del(&q->reset_qnode); } - - kfree(qtable); } /** @@ -1167,6 +1225,7 @@ static ssize_t assign_domain_store(struct device *dev, { int ret; unsigned long apqi; + DECLARE_BITMAP(apm_filtered, AP_DEVICES); struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); mutex_lock(&ap_perms_mutex); @@ -1196,8 +1255,10 @@ static ssize_t assign_domain_store(struct device *dev, vfio_ap_mdev_link_domain(matrix_mdev, apqi); - if (vfio_ap_mdev_filter_matrix(matrix_mdev)) + if (vfio_ap_mdev_filter_matrix(matrix_mdev, apm_filtered)) { vfio_ap_mdev_update_guest_apcb(matrix_mdev); + reset_queues_for_apids(matrix_mdev, apm_filtered); + } ret = count; done: @@ -1210,7 +1271,7 @@ static DEVICE_ATTR_WO(assign_domain); static void vfio_ap_mdev_unlink_domain(struct ap_matrix_mdev *matrix_mdev, unsigned long apqi, - struct ap_queue_table *qtable) + struct list_head *qlist) { unsigned long apid; struct vfio_ap_queue *q; @@ -1218,11 +1279,10 @@ static void vfio_ap_mdev_unlink_domain(struct ap_matrix_mdev *matrix_mdev, for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, AP_DEVICES) { q = vfio_ap_unlink_apqn_fr_mdev(matrix_mdev, apid, apqi); - if (q && qtable) { + if (q && qlist) { if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm) && test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm)) - hash_add(qtable->queues, &q->mdev_qnode, - q->apqn); + list_add_tail(&q->reset_qnode, qlist); } } } @@ -1230,26 +1290,23 @@ static void vfio_ap_mdev_unlink_domain(struct ap_matrix_mdev *matrix_mdev, static void vfio_ap_mdev_hot_unplug_domain(struct ap_matrix_mdev *matrix_mdev, unsigned long apqi) { - int loop_cursor; - struct vfio_ap_queue *q; - struct ap_queue_table *qtable = kzalloc(sizeof(*qtable), GFP_KERNEL); + struct vfio_ap_queue *q, *tmpq; + struct list_head qlist; - hash_init(qtable->queues); - vfio_ap_mdev_unlink_domain(matrix_mdev, apqi, qtable); + INIT_LIST_HEAD(&qlist); + vfio_ap_mdev_unlink_domain(matrix_mdev, apqi, &qlist); if (test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm)) { clear_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm); vfio_ap_mdev_update_guest_apcb(matrix_mdev); } - vfio_ap_mdev_reset_queues(qtable); + vfio_ap_mdev_reset_qlist(&qlist); - hash_for_each(qtable->queues, loop_cursor, q, mdev_qnode) { + list_for_each_entry_safe(q, tmpq, &qlist, reset_qnode) { vfio_ap_unlink_mdev_fr_queue(q); - hash_del(&q->mdev_qnode); + list_del(&q->reset_qnode); } - - kfree(qtable); } /** @@ -1604,7 +1661,7 @@ static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev) get_update_locks_for_kvm(kvm); kvm_arch_crypto_clear_masks(kvm); - vfio_ap_mdev_reset_queues(&matrix_mdev->qtable); + vfio_ap_mdev_reset_queues(matrix_mdev); kvm_put_kvm(kvm); matrix_mdev->kvm = NULL; @@ -1740,15 +1797,33 @@ static void vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q) } } -static int vfio_ap_mdev_reset_queues(struct ap_queue_table *qtable) +static int vfio_ap_mdev_reset_queues(struct ap_matrix_mdev *matrix_mdev) { int ret = 0, loop_cursor; struct vfio_ap_queue *q; - hash_for_each(qtable->queues, loop_cursor, q, mdev_qnode) + hash_for_each(matrix_mdev->qtable.queues, loop_cursor, q, mdev_qnode) vfio_ap_mdev_reset_queue(q); - hash_for_each(qtable->queues, loop_cursor, q, mdev_qnode) { + hash_for_each(matrix_mdev->qtable.queues, loop_cursor, q, mdev_qnode) { + flush_work(&q->reset_work); + + if (q->reset_status.response_code) + ret = -EIO; + } + + return ret; +} + +static int vfio_ap_mdev_reset_qlist(struct list_head *qlist) +{ + int ret = 0; + struct vfio_ap_queue *q; + + list_for_each_entry(q, qlist, reset_qnode) + vfio_ap_mdev_reset_queue(q); + + list_for_each_entry(q, qlist, reset_qnode) { flush_work(&q->reset_work); if (q->reset_status.response_code) @@ -1934,7 +2009,7 @@ static ssize_t vfio_ap_mdev_ioctl(struct vfio_device *vdev, ret = vfio_ap_mdev_get_device_info(arg); break; case VFIO_DEVICE_RESET: - ret = vfio_ap_mdev_reset_queues(&matrix_mdev->qtable); + ret = vfio_ap_mdev_reset_queues(matrix_mdev); break; case VFIO_DEVICE_GET_IRQ_INFO: ret = vfio_ap_get_irq_info(arg); @@ -2080,6 +2155,7 @@ int vfio_ap_mdev_probe_queue(struct ap_device *apdev) { int ret; struct vfio_ap_queue *q; + DECLARE_BITMAP(apm_filtered, AP_DEVICES); struct ap_matrix_mdev *matrix_mdev; ret = sysfs_create_group(&apdev->device.kobj, &vfio_queue_attr_group); @@ -2112,15 +2188,17 @@ int vfio_ap_mdev_probe_queue(struct ap_device *apdev) !bitmap_empty(matrix_mdev->aqm_add, AP_DOMAINS)) goto done; - if (vfio_ap_mdev_filter_matrix(matrix_mdev)) + if (vfio_ap_mdev_filter_matrix(matrix_mdev, apm_filtered)) { vfio_ap_mdev_update_guest_apcb(matrix_mdev); + reset_queues_for_apids(matrix_mdev, apm_filtered); + } } done: dev_set_drvdata(&apdev->device, q); release_update_locks_for_mdev(matrix_mdev); - return 0; + return ret; err_remove_group: sysfs_remove_group(&apdev->device.kobj, &vfio_queue_attr_group); @@ -2464,6 +2542,7 @@ void vfio_ap_on_cfg_changed(struct ap_config_info *cur_cfg_info, static void vfio_ap_mdev_hot_plug_cfg(struct ap_matrix_mdev *matrix_mdev) { + DECLARE_BITMAP(apm_filtered, AP_DEVICES); bool filter_domains, filter_adapters, filter_cdoms, do_hotplug = false; mutex_lock(&matrix_mdev->kvm->lock); @@ -2477,7 +2556,7 @@ static void vfio_ap_mdev_hot_plug_cfg(struct ap_matrix_mdev *matrix_mdev) matrix_mdev->adm_add, AP_DOMAINS); if (filter_adapters || filter_domains) - do_hotplug = vfio_ap_mdev_filter_matrix(matrix_mdev); + do_hotplug = vfio_ap_mdev_filter_matrix(matrix_mdev, apm_filtered); if (filter_cdoms) do_hotplug |= vfio_ap_mdev_filter_cdoms(matrix_mdev); @@ -2485,6 +2564,8 @@ static void vfio_ap_mdev_hot_plug_cfg(struct ap_matrix_mdev *matrix_mdev) if (do_hotplug) vfio_ap_mdev_update_guest_apcb(matrix_mdev); + reset_queues_for_apids(matrix_mdev, apm_filtered); + mutex_unlock(&matrix_dev->mdevs_lock); mutex_unlock(&matrix_mdev->kvm->lock); } diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 88aff8b81f2fc..98d37aa27044a 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -133,6 +133,8 @@ struct ap_matrix_mdev { * @apqn: the APQN of the AP queue device * @saved_isc: the guest ISC registered with the GIB interface * @mdev_qnode: allows the vfio_ap_queue struct to be added to a hashtable + * @reset_qnode: allows the vfio_ap_queue struct to be added to a list of queues + * that need to be reset * @reset_status: the status from the last reset of the queue * @reset_work: work to wait for queue reset to complete */ @@ -143,6 +145,7 @@ struct vfio_ap_queue { #define VFIO_AP_ISC_INVALID 0xff unsigned char saved_isc; struct hlist_node mdev_qnode; + struct list_head reset_qnode; struct ap_queue_status reset_status; struct work_struct reset_work; }; -- GitLab From f009cfa466558b7dfe97f167ba1875d6f9ea4c07 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Mon, 15 Jan 2024 13:54:35 -0500 Subject: [PATCH 1055/1290] s390/vfio-ap: reset queues associated with adapter for queue unbound from driver When a queue is unbound from the vfio_ap device driver, if that queue is assigned to a guest's AP configuration, its associated adapter is removed because queues are defined to a guest via a matrix of adapters and domains; so, it is not possible to remove a single queue. If an adapter is removed from the guest's AP configuration, all associated queues must be reset to prevent leaking crypto data should any of them be assigned to a different guest or device driver. The one caveat is that if the queue is being removed because the adapter or domain has been removed from the host's AP configuration, then an attempt to reset the queue will fail with response code 01, AP-queue number not valid; so resetting these queues should be skipped. Acked-by: Halil Pasic Signed-off-by: Tony Krowiak Fixes: 09d31ff78793 ("s390/vfio-ap: hot plug/unplug of AP devices when probed/removed") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240115185441.31526-6-akrowiak@linux.ibm.com Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/vfio_ap_ops.c | 76 +++++++++++++++++-------------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 44cd29aace8ec..550c936c413db 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -939,45 +939,45 @@ static void vfio_ap_mdev_link_adapter(struct ap_matrix_mdev *matrix_mdev, AP_MKQID(apid, apqi)); } +static void collect_queues_to_reset(struct ap_matrix_mdev *matrix_mdev, + unsigned long apid, + struct list_head *qlist) +{ + struct vfio_ap_queue *q; + unsigned long apqi; + + for_each_set_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm, AP_DOMAINS) { + q = vfio_ap_mdev_get_queue(matrix_mdev, AP_MKQID(apid, apqi)); + if (q) + list_add_tail(&q->reset_qnode, qlist); + } +} + +static void reset_queues_for_apid(struct ap_matrix_mdev *matrix_mdev, + unsigned long apid) +{ + struct list_head qlist; + + INIT_LIST_HEAD(&qlist); + collect_queues_to_reset(matrix_mdev, apid, &qlist); + vfio_ap_mdev_reset_qlist(&qlist); +} + static int reset_queues_for_apids(struct ap_matrix_mdev *matrix_mdev, unsigned long *apm_reset) { - struct vfio_ap_queue *q, *tmpq; struct list_head qlist; - unsigned long apid, apqi; - int apqn, ret = 0; + unsigned long apid; if (bitmap_empty(apm_reset, AP_DEVICES)) return 0; INIT_LIST_HEAD(&qlist); - for_each_set_bit_inv(apid, apm_reset, AP_DEVICES) { - for_each_set_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm, - AP_DOMAINS) { - /* - * If the domain is not in the host's AP configuration, - * then resetting it will fail with response code 01 - * (APQN not valid). - */ - if (!test_bit_inv(apqi, - (unsigned long *)matrix_dev->info.aqm)) - continue; - - apqn = AP_MKQID(apid, apqi); - q = vfio_ap_mdev_get_queue(matrix_mdev, apqn); - - if (q) - list_add_tail(&q->reset_qnode, &qlist); - } - } + for_each_set_bit_inv(apid, apm_reset, AP_DEVICES) + collect_queues_to_reset(matrix_mdev, apid, &qlist); - ret = vfio_ap_mdev_reset_qlist(&qlist); - - list_for_each_entry_safe(q, tmpq, &qlist, reset_qnode) - list_del(&q->reset_qnode); - - return ret; + return vfio_ap_mdev_reset_qlist(&qlist); } /** @@ -2217,24 +2217,30 @@ void vfio_ap_mdev_remove_queue(struct ap_device *apdev) matrix_mdev = q->matrix_mdev; if (matrix_mdev) { - vfio_ap_unlink_queue_fr_mdev(q); - apid = AP_QID_CARD(q->apqn); apqi = AP_QID_QUEUE(q->apqn); - - /* - * If the queue is assigned to the guest's APCB, then remove - * the adapter's APID from the APCB and hot it into the guest. - */ + /* If the queue is assigned to the guest's AP configuration */ if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm) && test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm)) { + /* + * Since the queues are defined via a matrix of adapters + * and domains, it is not possible to hot unplug a + * single queue; so, let's unplug the adapter. + */ clear_bit_inv(apid, matrix_mdev->shadow_apcb.apm); vfio_ap_mdev_update_guest_apcb(matrix_mdev); + reset_queues_for_apid(matrix_mdev, apid); + goto done; } } vfio_ap_mdev_reset_queue(q); flush_work(&q->reset_work); + +done: + if (matrix_mdev) + vfio_ap_unlink_queue_fr_mdev(q); + dev_set_drvdata(&apdev->device, NULL); kfree(q); release_update_locks_for_mdev(matrix_mdev); -- GitLab From b9bd10c43456d16abd97b717446f51afb3b88411 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Mon, 15 Jan 2024 13:54:36 -0500 Subject: [PATCH 1056/1290] s390/vfio-ap: do not reset queue removed from host config When a queue is unbound from the vfio_ap device driver, it is reset to ensure its crypto data is not leaked when it is bound to another device driver. If the queue is unbound due to the fact that the adapter or domain was removed from the host's AP configuration, then attempting to reset it will fail with response code 01 (APID not valid) getting returned from the reset command. Let's ensure that the queue is assigned to the host's configuration before resetting it. Signed-off-by: Tony Krowiak Reviewed-by: "Jason J. Herne" Reviewed-by: Halil Pasic Fixes: eeb386aeb5b7 ("s390/vfio-ap: handle config changed and scan complete notification") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240115185441.31526-7-akrowiak@linux.ibm.com Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/vfio_ap_ops.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 550c936c413db..983b3b16196c6 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -2215,10 +2215,10 @@ void vfio_ap_mdev_remove_queue(struct ap_device *apdev) q = dev_get_drvdata(&apdev->device); get_update_locks_for_queue(q); matrix_mdev = q->matrix_mdev; + apid = AP_QID_CARD(q->apqn); + apqi = AP_QID_QUEUE(q->apqn); if (matrix_mdev) { - apid = AP_QID_CARD(q->apqn); - apqi = AP_QID_QUEUE(q->apqn); /* If the queue is assigned to the guest's AP configuration */ if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm) && test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm)) { @@ -2234,8 +2234,16 @@ void vfio_ap_mdev_remove_queue(struct ap_device *apdev) } } - vfio_ap_mdev_reset_queue(q); - flush_work(&q->reset_work); + /* + * If the queue is not in the host's AP configuration, then resetting + * it will fail with response code 01, (APQN not valid); so, let's make + * sure it is in the host's config. + */ + if (test_bit_inv(apid, (unsigned long *)matrix_dev->info.apm) && + test_bit_inv(apqi, (unsigned long *)matrix_dev->info.aqm)) { + vfio_ap_mdev_reset_queue(q); + flush_work(&q->reset_work); + } done: if (matrix_mdev) -- GitLab From 78fbb92af27d0982634116c7a31065f24d092826 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jan 2024 13:26:57 +0000 Subject: [PATCH 1057/1290] nbd: always initialize struct msghdr completely syzbot complains that msg->msg_get_inq value can be uninitialized [1] struct msghdr got many new fields recently, we should always make sure their values is zero by default. [1] BUG: KMSAN: uninit-value in tcp_recvmsg+0x686/0xac0 net/ipv4/tcp.c:2571 tcp_recvmsg+0x686/0xac0 net/ipv4/tcp.c:2571 inet_recvmsg+0x131/0x580 net/ipv4/af_inet.c:879 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg+0x12b/0x1e0 net/socket.c:1066 __sock_xmit+0x236/0x5c0 drivers/block/nbd.c:538 nbd_read_reply drivers/block/nbd.c:732 [inline] recv_work+0x262/0x3100 drivers/block/nbd.c:863 process_one_work kernel/workqueue.c:2627 [inline] process_scheduled_works+0x104e/0x1e70 kernel/workqueue.c:2700 worker_thread+0xf45/0x1490 kernel/workqueue.c:2781 kthread+0x3ed/0x540 kernel/kthread.c:388 ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 Local variable msg created at: __sock_xmit+0x4c/0x5c0 drivers/block/nbd.c:513 nbd_read_reply drivers/block/nbd.c:732 [inline] recv_work+0x262/0x3100 drivers/block/nbd.c:863 CPU: 1 PID: 7465 Comm: kworker/u5:1 Not tainted 6.7.0-rc7-syzkaller-00041-gf016f7547aee #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 Workqueue: nbd5-recv recv_work Fixes: f94fd25cb0aa ("tcp: pass back data left in socket after receive") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: stable@vger.kernel.org Cc: Josef Bacik Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: nbd@other.debian.org Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240112132657.647112-1-edumazet@google.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4e72ec4e25ac5..33a8f37bb6a1f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -508,7 +508,7 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send, struct iov_iter *iter, int msg_flags, int *sent) { int result; - struct msghdr msg; + struct msghdr msg = {} ; unsigned int noreclaim_flag; if (unlikely(!sock)) { @@ -524,10 +524,6 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send, do { sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; sock->sk->sk_use_task_frag = false; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_controllen = 0; msg.msg_flags = msg_flags | MSG_NOSIGNAL; if (send) -- GitLab From baf59771343dc0c2ef9ac3189bf9df2d6143654f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jan 2024 17:08:32 -0700 Subject: [PATCH 1058/1290] io_uring/register: guard compat syscall with CONFIG_COMPAT Add compat.h include to avoid a potential build issue: io_uring/register.c:281:6: error: call to undeclared function 'in_compat_syscall'; ISO C99 and later do not support implicit function declarations [-Werror,-Wimplicit-function-declaration] if (in_compat_syscall()) { ^ 1 warning generated. io_uring/register.c:282:9: error: call to undeclared function 'compat_get_bitmap'; ISO C99 and later do not support implicit function declarations [-Werror,-Wimplicit-function-declaration] ret = compat_get_bitmap(cpumask_bits(new_mask), ^ Fixes: c43203154d8a ("io_uring/register: move io_uring_register(2) related code to register.c") Reported-by: Manu Bretelle Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- io_uring/register.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index 708dd1d89add4..5e62c12089965 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -278,13 +279,14 @@ static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, if (len > cpumask_size()) len = cpumask_size(); - if (in_compat_syscall()) { +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) ret = compat_get_bitmap(cpumask_bits(new_mask), (const compat_ulong_t __user *)arg, len * 8 /* CHAR_BIT */); - } else { + else +#endif ret = copy_from_user(new_mask, arg, len); - } if (ret) { free_cpumask_var(new_mask); -- GitLab From dc12d1799ce710fd90abbe0ced71e7e1ae0894fc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 17 Jan 2024 00:57:26 +0000 Subject: [PATCH 1059/1290] io_uring: adjust defer tw counting The UINT_MAX work item counting bias in io_req_local_work_add() in case of !IOU_F_TWQ_LAZY_WAKE works in a sense that we will not miss a wake up, however it's still eerie. In particular, if we add a lazy work item after a non-lazy one, we'll increment it and get nr_tw==0, and subsequent adds may try to unnecessarily wake up the task, which is though not so likely to happen in real workloads. Half the bias, it's still large enough to be larger than any valid ->cq_wait_nr, which is limited by IORING_MAX_CQ_ENTRIES, but further have a good enough of space before it overflows. Fixes: 8751d15426a31 ("io_uring: reduce scheduling due to tw") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/108b971e958deaf7048342930c341ba90f75d806.1705438669.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 50c9f04bc193c..d40c767a62163 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1325,7 +1325,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) nr_tw = nr_tw_prev + 1; /* Large enough to fail the nr_wait comparison below */ if (!(flags & IOU_F_TWQ_LAZY_WAKE)) - nr_tw = -1U; + nr_tw = INT_MAX; req->nr_tw = nr_tw; req->io_task_work.node.next = first; -- GitLab From d381099f980b5f6c3c7e150baf13b0aaefc66c29 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 17 Jan 2024 00:57:27 +0000 Subject: [PATCH 1060/1290] io_uring: clean up local tw add-wait sync Kill a smp_mb__after_atomic() right before wake_up, it's useless, and add a comment explaining implicit barriers from cmpxchg and synchronsation around ->cq_wait_nr with the waiter. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3007f3c2d53c72b61de56919ef56b53158b8276f.1705438669.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d40c767a62163..3ab7e6a46149b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1332,6 +1332,14 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) } while (!try_cmpxchg(&ctx->work_llist.first, &first, &req->io_task_work.node)); + /* + * cmpxchg implies a full barrier, which pairs with the barrier + * in set_current_state() on the io_cqring_wait() side. It's used + * to ensure that either we see updated ->cq_wait_nr, or waiters + * going to sleep will observe the work added to the list, which + * is similar to the wait/wawke task state sync. + */ + if (!first) { if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); @@ -1346,8 +1354,6 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) /* either not enough or the previous add has already woken it up */ if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) return; - /* pairs with set_current_state() in io_cqring_wait() */ - smp_mb__after_atomic(); wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } -- GitLab From e8c407717b4814dac5641d93cbbbb9fc394f7cf0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 17 Jan 2024 00:57:28 +0000 Subject: [PATCH 1061/1290] io_uring: clean *local_work_add var naming if (!first) { ... } While it reads as do something if it's not the first entry, it does exactly the opposite because "first" here is a pointer to the first entry. Remove the confusion by renaming it into "head". Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3b8be483b52f58a524185bb88694b8a268e7e85d.1705438669.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3ab7e6a46149b..3508198d17ba9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1304,16 +1304,16 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; - struct llist_node *first; + struct llist_node *head; if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) flags &= ~IOU_F_TWQ_LAZY_WAKE; - first = READ_ONCE(ctx->work_llist.first); + head = READ_ONCE(ctx->work_llist.first); do { nr_tw_prev = 0; - if (first) { - struct io_kiocb *first_req = container_of(first, + if (head) { + struct io_kiocb *first_req = container_of(head, struct io_kiocb, io_task_work.node); /* @@ -1328,8 +1328,8 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) nr_tw = INT_MAX; req->nr_tw = nr_tw; - req->io_task_work.node.next = first; - } while (!try_cmpxchg(&ctx->work_llist.first, &first, + req->io_task_work.node.next = head; + } while (!try_cmpxchg(&ctx->work_llist.first, &head, &req->io_task_work.node)); /* @@ -1340,7 +1340,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) * is similar to the wait/wawke task state sync. */ - if (!first) { + if (!head) { if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) -- GitLab From b4bc35cf8704db86203c0739711dab1804265bf3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 17 Jan 2024 00:57:29 +0000 Subject: [PATCH 1062/1290] io_uring: combine cq_wait_nr checks Instead of explicitly checking ->cq_wait_nr for whether there are waiting, which is currently represented by 0, we can store there a large value and the nr_tw will automatically filter out those cases. Add a named constant for that and for the wake up bias value. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/38def30282654d980673976cd42fde9bab19b297.1705438669.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3508198d17ba9..b5fa3c7df1cf8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -137,6 +137,14 @@ struct io_defer_entry { #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) +/* + * No waiters. It's larger than any valid value of the tw counter + * so that tests against ->cq_wait_nr would fail and skip wake_up(). + */ +#define IO_CQ_WAKE_INIT (-1U) +/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ +#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) + static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); @@ -303,6 +311,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; ctx->flags = p->flags; + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); @@ -1306,6 +1315,13 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *head; + /* See comment above IO_CQ_WAKE_INIT */ + BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); + + /* + * We don't know how many reuqests is there in the link and whether + * they can even be queued lazily, fall back to non-lazy. + */ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) flags &= ~IOU_F_TWQ_LAZY_WAKE; @@ -1322,10 +1338,14 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) */ nr_tw_prev = READ_ONCE(first_req->nr_tw); } + + /* + * Theoretically, it can overflow, but that's fine as one of + * previous adds should've tried to wake the task. + */ nr_tw = nr_tw_prev + 1; - /* Large enough to fail the nr_wait comparison below */ if (!(flags & IOU_F_TWQ_LAZY_WAKE)) - nr_tw = INT_MAX; + nr_tw = IO_CQ_WAKE_FORCE; req->nr_tw = nr_tw; req->io_task_work.node.next = head; @@ -1348,11 +1368,11 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) } nr_wait = atomic_read(&ctx->cq_wait_nr); - /* no one is waiting */ - if (!nr_wait) + /* not enough or no one is waiting */ + if (nr_tw < nr_wait) return; - /* either not enough or the previous add has already woken it up */ - if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) + /* the previous add has already woken it up */ + if (nr_tw_prev >= nr_wait) return; wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } @@ -2620,7 +2640,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, ret = io_cqring_wait_schedule(ctx, &iowq); __set_current_state(TASK_RUNNING); - atomic_set(&ctx->cq_wait_nr, 0); + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); /* * Run task_work after scheduling and before io_should_wake(). -- GitLab From fb3c007fde80d9d3b4207943e74c150c9116cead Mon Sep 17 00:00:00 2001 From: Bin Li Date: Wed, 17 Jan 2024 23:41:23 +0800 Subject: [PATCH 1063/1290] ALSA: hda/realtek: Enable headset mic on Lenovo M70 Gen5 Lenovo M70 Gen5 is equipped with ALC623, and it needs ALC283_FIXUP_HEADSET_MIC quirk to make its headset mic work. Signed-off-by: Bin Li Cc: Link: https://lore.kernel.org/r/20240117154123.21578-1-bin.li@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index dbf31fe901daa..f6f16622f9cc7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10233,6 +10233,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3176, "ThinkCentre Station", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x17aa, 0x3178, "ThinkCentre Station", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x17aa, 0x31af, "ThinkCentre Station", ALC623_FIXUP_LENOVO_THINKSTATION_P340), + SND_PCI_QUIRK(0x17aa, 0x334b, "Lenovo ThinkCentre M70 Gen5", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x17aa, 0x3801, "Lenovo Yoga9 14IAP7", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), SND_PCI_QUIRK(0x17aa, 0x3802, "Lenovo Yoga DuetITL 2021", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), -- GitLab From 4f41d30cd6dc865c3cbc1a852372321eba6d4e4c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 25 Nov 2023 13:05:04 +0100 Subject: [PATCH 1064/1290] kdb: Fix a potential buffer overflow in kdb_local() When appending "[defcmd]" to 'kdb_prompt_str', the size of the string already in the buffer should be taken into account. An option could be to switch from strncat() to strlcat() which does the correct test to avoid such an overflow. However, this actually looks as dead code, because 'defcmd_in_progress' can't be true here. See a more detailed explanation at [1]. [1]: https://lore.kernel.org/all/CAD=FV=WSh7wKN7Yp-3wWiDgX4E3isQ8uh0LCzTmd1v9Cg9j+nQ@mail.gmail.com/ Fixes: 5d5314d6795f ("kdb: core for kgdb back end (1 of 2)") Signed-off-by: Christophe JAILLET Reviewed-by: Douglas Anderson --- kernel/debug/kdb/kdb_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 6b213c8252d62..d05066cb40b2e 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1348,8 +1348,6 @@ do_full_getstr: /* PROMPT can only be set if we have MEM_READ permission. */ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), raw_smp_processor_id()); - if (defcmd_in_progress) - strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN); /* * Fetch command from keyboard -- GitLab From 49e60333d743ae32db3bdde2f93bc818482dd741 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 Jan 2024 12:36:09 -0800 Subject: [PATCH 1065/1290] blk-mq: Remove the hctx 'run' debugfs attribute Nobody uses the debugfs hctx 'run' attribute. Hence remove this attribute and also the code that updates the corresponding member variable. Suggested-by: Jens Axboe Cc: Gabriel Ryan Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240117203609.4122520-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 18 ------------------ block/blk-mq-sched.c | 2 -- include/linux/blk-mq.h | 3 --- 3 files changed, 23 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 5cbeb9344f2f5..94668e72ab09b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -479,23 +479,6 @@ out: return res; } -static int hctx_run_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "%lu\n", hctx->run); - return 0; -} - -static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count, - loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->run = 0; - return count; -} - static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -624,7 +607,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, - {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"type", 0400, hctx_type_show}, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 67c95f31b15bb..451a2c1f1f321 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -324,8 +324,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; - hctx->run++; - /* * A return of -EAGAIN is an indication that hctx->dispatch is not * empty and we must run again in order to avoid starving flushes. diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a676e116085f3..7a8150a5f0513 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -391,9 +391,6 @@ struct blk_mq_hw_ctx { */ struct blk_mq_tags *sched_tags; - /** @run: Number of dispatched requests. */ - unsigned long run; - /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; /** @queue_num: Index of this hardware queue. */ -- GitLab From 590b1d19d73914477cfd9faac7a0d7dcf5b4eb08 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 16 Jan 2024 00:22:13 +0100 Subject: [PATCH 1066/1290] rtc: max31335: remove unecessary locking There is no race condition when accessing MAX31335_STATUS1 because it is always about clearing the alarm interrupt bit. Reviewed-by: Antoniu Miclaus Link: https://lore.kernel.org/r/20240115232215.273374-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-max31335.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/rtc/rtc-max31335.c b/drivers/rtc/rtc-max31335.c index 3ddfe71bbb560..2ce23f60a7f34 100644 --- a/drivers/rtc/rtc-max31335.c +++ b/drivers/rtc/rtc-max31335.c @@ -348,27 +348,19 @@ static int max31335_alarm_irq_enable(struct device *dev, unsigned int enabled) static irqreturn_t max31335_handle_irq(int irq, void *dev_id) { struct max31335_data *max31335 = dev_id; - struct mutex *lock = &max31335->rtc->ops_lock; int ret, status; - mutex_lock(lock); - ret = regmap_read(max31335->regmap, MAX31335_STATUS1, &status); if (ret) - goto exit; + return IRQ_HANDLED; if (FIELD_GET(MAX31335_STATUS1_A1F, status)) { - ret = regmap_update_bits(max31335->regmap, MAX31335_STATUS1, - MAX31335_STATUS1_A1F, 0); - if (ret) - goto exit; + regmap_update_bits(max31335->regmap, MAX31335_STATUS1, + MAX31335_STATUS1_A1F, 0); rtc_update_irq(max31335->rtc, 1, RTC_AF | RTC_IRQF); } -exit: - mutex_unlock(lock); - return IRQ_HANDLED; } -- GitLab From b7d450d98b0f9917cac31b39ba459a4aee26c8b1 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 16 Jan 2024 00:22:14 +0100 Subject: [PATCH 1067/1290] rtc: max31335: use regmap_update_bits_check Simplify the IRQ handler by using regmap_update_bits_check. Reviewed-by: Antoniu Miclaus Link: https://lore.kernel.org/r/20240115232215.273374-2-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-max31335.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/rtc/rtc-max31335.c b/drivers/rtc/rtc-max31335.c index 2ce23f60a7f34..a38d303d9df4d 100644 --- a/drivers/rtc/rtc-max31335.c +++ b/drivers/rtc/rtc-max31335.c @@ -348,18 +348,16 @@ static int max31335_alarm_irq_enable(struct device *dev, unsigned int enabled) static irqreturn_t max31335_handle_irq(int irq, void *dev_id) { struct max31335_data *max31335 = dev_id; - int ret, status; + bool status; + int ret; - ret = regmap_read(max31335->regmap, MAX31335_STATUS1, &status); + ret = regmap_update_bits_check(max31335->regmap, MAX31335_STATUS1, + MAX31335_STATUS1_A1F, 0, &status); if (ret) return IRQ_HANDLED; - if (FIELD_GET(MAX31335_STATUS1_A1F, status)) { - regmap_update_bits(max31335->regmap, MAX31335_STATUS1, - MAX31335_STATUS1_A1F, 0); - + if (status) rtc_update_irq(max31335->rtc, 1, RTC_AF | RTC_IRQF); - } return IRQ_HANDLED; } -- GitLab From dd7fe5d9fd6a27531e985e3812ef4031bd316b01 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 17 Jan 2024 10:54:16 -0700 Subject: [PATCH 1068/1290] rtc: max31335: Fix comparison in max31335_volatile_reg() Clang warns (or errors with CONFIG_WERROR=y): drivers/rtc/rtc-max31335.c:211:36: error: use of logical '||' with constant operand [-Werror,-Wconstant-logical-operand] 211 | if (reg == MAX31335_TEMP_DATA_MSB || MAX31335_TEMP_DATA_LSB) | ^ ~~~~~~~~~~~~~~~~~~~~~~ drivers/rtc/rtc-max31335.c:211:36: note: use '|' for a bitwise operation 211 | if (reg == MAX31335_TEMP_DATA_MSB || MAX31335_TEMP_DATA_LSB) | ^~ | | 1 error generated. This clearly should be a comparison against reg. Fix the comparison so that max31335_volatile_reg() does not always return true. Closes: https://github.com/ClangBuiltLinux/linux/issues/1980 Fixes: dedaf03b99d6 ("rtc: max31335: add driver support") Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240117-rtc-max3133-fix-comparison-v1-1-91e98b29d564@kernel.org Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-max31335.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-max31335.c b/drivers/rtc/rtc-max31335.c index a38d303d9df4d..402fda8fd5488 100644 --- a/drivers/rtc/rtc-max31335.c +++ b/drivers/rtc/rtc-max31335.c @@ -208,7 +208,7 @@ static bool max31335_volatile_reg(struct device *dev, unsigned int reg) return true; /* temperature registers */ - if (reg == MAX31335_TEMP_DATA_MSB || MAX31335_TEMP_DATA_LSB) + if (reg == MAX31335_TEMP_DATA_MSB || reg == MAX31335_TEMP_DATA_LSB) return true; return false; -- GitLab From 8681de6457aa071a5982e9b20682e56a7d87e3b6 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 5 Jan 2024 14:53:42 +0000 Subject: [PATCH 1069/1290] rtc: da9063: Make IRQ as optional On some platforms (eg: RZ/{G2UL,Five} SMARC EVK), there is no IRQ populated by default. Add irq optional support. Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240105145344.204453-2-biju.das.jz@bp.renesas.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-da9063.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/rtc/rtc-da9063.c b/drivers/rtc/rtc-da9063.c index 2f5d60622564a..b3a1f852c318b 100644 --- a/drivers/rtc/rtc-da9063.c +++ b/drivers/rtc/rtc-da9063.c @@ -485,25 +485,29 @@ static int da9063_rtc_probe(struct platform_device *pdev) clear_bit(RTC_FEATURE_UPDATE_INTERRUPT, rtc->rtc_dev->features); } - irq_alarm = platform_get_irq_byname(pdev, "ALARM"); - if (irq_alarm < 0) + irq_alarm = platform_get_irq_byname_optional(pdev, "ALARM"); + if (irq_alarm >= 0) { + ret = devm_request_threaded_irq(&pdev->dev, irq_alarm, NULL, + da9063_alarm_event, + IRQF_TRIGGER_LOW | IRQF_ONESHOT, + "ALARM", rtc); + if (ret) + dev_err(&pdev->dev, + "Failed to request ALARM IRQ %d: %d\n", + irq_alarm, ret); + + ret = dev_pm_set_wake_irq(&pdev->dev, irq_alarm); + if (ret) + dev_warn(&pdev->dev, + "Failed to set IRQ %d as a wake IRQ: %d\n", + irq_alarm, ret); + + device_init_wakeup(&pdev->dev, true); + } else if (irq_alarm != -ENXIO) { return irq_alarm; - - ret = devm_request_threaded_irq(&pdev->dev, irq_alarm, NULL, - da9063_alarm_event, - IRQF_TRIGGER_LOW | IRQF_ONESHOT, - "ALARM", rtc); - if (ret) - dev_err(&pdev->dev, "Failed to request ALARM IRQ %d: %d\n", - irq_alarm, ret); - - ret = dev_pm_set_wake_irq(&pdev->dev, irq_alarm); - if (ret) - dev_warn(&pdev->dev, - "Failed to set IRQ %d as a wake IRQ: %d\n", - irq_alarm, ret); - - device_init_wakeup(&pdev->dev, true); + } else { + clear_bit(RTC_FEATURE_ALARM, rtc->rtc_dev->features); + } return devm_rtc_register_device(rtc->rtc_dev); } -- GitLab From 4b60c32e979ac84a715c329161ddf42b0790be4e Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 5 Jan 2024 14:53:43 +0000 Subject: [PATCH 1070/1290] rtc: da9063: Use device_get_match_data() Replace of_match_node()->device_get_match_data() for the data associated with device match. Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240105145344.204453-3-biju.das.jz@bp.renesas.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-da9063.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-da9063.c b/drivers/rtc/rtc-da9063.c index b3a1f852c318b..265abdd7e935f 100644 --- a/drivers/rtc/rtc-da9063.c +++ b/drivers/rtc/rtc-da9063.c @@ -377,7 +377,6 @@ static int da9063_rtc_probe(struct platform_device *pdev) { struct da9063_compatible_rtc *rtc; const struct da9063_compatible_rtc_regmap *config; - const struct of_device_id *match; int irq_alarm; u8 data[RTC_DATA_LEN]; int ret; @@ -385,14 +384,11 @@ static int da9063_rtc_probe(struct platform_device *pdev) if (!pdev->dev.of_node) return -ENXIO; - match = of_match_node(da9063_compatible_reg_id_table, - pdev->dev.of_node); - rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL); if (!rtc) return -ENOMEM; - rtc->config = match->data; + rtc->config = device_get_match_data(&pdev->dev); if (of_device_is_compatible(pdev->dev.of_node, "dlg,da9063-rtc")) { struct da9063 *chip = dev_get_drvdata(pdev->dev.parent); -- GitLab From f5334aa88345874d54a406e86a886a54173e1ba8 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 5 Jan 2024 14:53:44 +0000 Subject: [PATCH 1071/1290] rtc: da9063: Use dev_err_probe() Replace dev_err()->dev_err_probe() to simpilfy probe(). Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240105145344.204453-4-biju.das.jz@bp.renesas.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-da9063.c | 42 ++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/drivers/rtc/rtc-da9063.c b/drivers/rtc/rtc-da9063.c index 265abdd7e935f..859397541f298 100644 --- a/drivers/rtc/rtc-da9063.c +++ b/drivers/rtc/rtc-da9063.c @@ -407,57 +407,49 @@ static int da9063_rtc_probe(struct platform_device *pdev) config->rtc_enable_reg, config->rtc_enable_mask, config->rtc_enable_mask); - if (ret < 0) { - dev_err(&pdev->dev, "Failed to enable RTC\n"); - return ret; - } + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, "Failed to enable RTC\n"); ret = regmap_update_bits(rtc->regmap, config->rtc_enable_32k_crystal_reg, config->rtc_crystal_mask, config->rtc_crystal_mask); - if (ret < 0) { - dev_err(&pdev->dev, "Failed to run 32kHz oscillator\n"); - return ret; - } + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, + "Failed to run 32kHz oscillator\n"); ret = regmap_update_bits(rtc->regmap, config->rtc_alarm_secs_reg, config->rtc_alarm_status_mask, 0); - if (ret < 0) { - dev_err(&pdev->dev, "Failed to access RTC alarm register\n"); - return ret; - } + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, + "Failed to access RTC alarm register\n"); ret = regmap_update_bits(rtc->regmap, config->rtc_alarm_secs_reg, DA9063_ALARM_STATUS_ALARM, DA9063_ALARM_STATUS_ALARM); - if (ret < 0) { - dev_err(&pdev->dev, "Failed to access RTC alarm register\n"); - return ret; - } + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, + "Failed to access RTC alarm register\n"); ret = regmap_update_bits(rtc->regmap, config->rtc_alarm_year_reg, config->rtc_tick_on_mask, 0); - if (ret < 0) { - dev_err(&pdev->dev, "Failed to disable TICKs\n"); - return ret; - } + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, + "Failed to disable TICKs\n"); data[RTC_SEC] = 0; ret = regmap_bulk_read(rtc->regmap, config->rtc_alarm_secs_reg, &data[config->rtc_data_start], config->rtc_alarm_len); - if (ret < 0) { - dev_err(&pdev->dev, "Failed to read initial alarm data: %d\n", - ret); - return ret; - } + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, + "Failed to read initial alarm data\n"); platform_set_drvdata(pdev, rtc); -- GitLab From 14688f1a91e1f37bc6bf50ff5241e857f24338e0 Mon Sep 17 00:00:00 2001 From: Mia Lin Date: Mon, 13 Nov 2023 18:38:07 +0800 Subject: [PATCH 1072/1290] rtc: nuvoton: Compatible with NCT3015Y-R and NCT3018Y-R The NCT3015Y-R and NCT3018Y-R use the same datasheet but have different topologies as follows. - Topology (Only 1st i2c can set TWO bit and HF bit) In NCT3015Y-R, rtc 1st i2c is connected to a host CPU rtc 2nd i2c is connected to a BMC In NCT3018Y-R, rtc 1st i2c is connected to a BMC rtc 2nd i2c is connected to a host CPU In order to be compatible with NCT3015Y-R and NCT3018Y-R, - In probe, If part number is NCT3018Y-R, only set HF bit to 24-Hour format. Else, do nothing - In set_time, If part number is NCT3018Y-R && TWO bit is 0, change TWO bit to 1, and restore TWO bit after updating time. Signed-off-by: Mia Lin Link: https://lore.kernel.org/r/20231113103807.1036978-2-mimi05633@gmail.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-nct3018y.c | 52 +++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/drivers/rtc/rtc-nct3018y.c b/drivers/rtc/rtc-nct3018y.c index ed4e606be8e58..f488a189a4651 100644 --- a/drivers/rtc/rtc-nct3018y.c +++ b/drivers/rtc/rtc-nct3018y.c @@ -23,6 +23,7 @@ #define NCT3018Y_REG_CTRL 0x0A /* timer control */ #define NCT3018Y_REG_ST 0x0B /* status */ #define NCT3018Y_REG_CLKO 0x0C /* clock out */ +#define NCT3018Y_REG_PART 0x21 /* part info */ #define NCT3018Y_BIT_AF BIT(7) #define NCT3018Y_BIT_ST BIT(7) @@ -37,10 +38,12 @@ #define NCT3018Y_REG_BAT_MASK 0x07 #define NCT3018Y_REG_CLKO_F_MASK 0x03 /* frequenc mask */ #define NCT3018Y_REG_CLKO_CKE 0x80 /* clock out enabled */ +#define NCT3018Y_REG_PART_NCT3018Y 0x02 struct nct3018y { struct rtc_device *rtc; struct i2c_client *client; + int part_num; #ifdef CONFIG_COMMON_CLK struct clk_hw clkout_hw; #endif @@ -177,8 +180,27 @@ static int nct3018y_rtc_read_time(struct device *dev, struct rtc_time *tm) static int nct3018y_rtc_set_time(struct device *dev, struct rtc_time *tm) { struct i2c_client *client = to_i2c_client(dev); + struct nct3018y *nct3018y = dev_get_drvdata(dev); unsigned char buf[4] = {0}; - int err; + int err, flags; + int restore_flags = 0; + + flags = i2c_smbus_read_byte_data(client, NCT3018Y_REG_CTRL); + if (flags < 0) { + dev_dbg(&client->dev, "Failed to read NCT3018Y_REG_CTRL.\n"); + return flags; + } + + /* Check and set TWO bit */ + if (nct3018y->part_num == NCT3018Y_REG_PART_NCT3018Y && !(flags & NCT3018Y_BIT_TWO)) { + restore_flags = 1; + flags |= NCT3018Y_BIT_TWO; + err = i2c_smbus_write_byte_data(client, NCT3018Y_REG_CTRL, flags); + if (err < 0) { + dev_dbg(&client->dev, "Unable to write NCT3018Y_REG_CTRL.\n"); + return err; + } + } buf[0] = bin2bcd(tm->tm_sec); err = i2c_smbus_write_byte_data(client, NCT3018Y_REG_SC, buf[0]); @@ -212,6 +234,18 @@ static int nct3018y_rtc_set_time(struct device *dev, struct rtc_time *tm) return -EIO; } + /* Restore TWO bit */ + if (restore_flags) { + if (nct3018y->part_num == NCT3018Y_REG_PART_NCT3018Y) + flags &= ~NCT3018Y_BIT_TWO; + + err = i2c_smbus_write_byte_data(client, NCT3018Y_REG_CTRL, flags); + if (err < 0) { + dev_dbg(&client->dev, "Unable to write NCT3018Y_REG_CTRL.\n"); + return err; + } + } + return err; } @@ -479,11 +513,17 @@ static int nct3018y_probe(struct i2c_client *client) dev_dbg(&client->dev, "%s: NCT3018Y_BIT_TWO is set\n", __func__); } - flags = NCT3018Y_BIT_TWO; - err = i2c_smbus_write_byte_data(client, NCT3018Y_REG_CTRL, flags); - if (err < 0) { - dev_dbg(&client->dev, "Unable to write NCT3018Y_REG_CTRL\n"); - return err; + nct3018y->part_num = i2c_smbus_read_byte_data(client, NCT3018Y_REG_PART); + if (nct3018y->part_num < 0) { + dev_dbg(&client->dev, "Failed to read NCT3018Y_REG_PART.\n"); + return nct3018y->part_num; + } else if (nct3018y->part_num == NCT3018Y_REG_PART_NCT3018Y) { + flags = NCT3018Y_BIT_HF; + err = i2c_smbus_write_byte_data(client, NCT3018Y_REG_CTRL, flags); + if (err < 0) { + dev_dbg(&client->dev, "Unable to write NCT3018Y_REG_CTRL.\n"); + return err; + } } flags = 0; -- GitLab From 0de65288d75ff96c30e216557d979fb9342c4323 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 17 Jan 2024 14:09:34 +0100 Subject: [PATCH 1073/1290] RISC-V: selftests: cbo: Ensure asm operands match constraints The 'i' constraint expects a constant operand, which fn and its constant derivative MK_CBO(fn) are, but passing fn through a function as a parameter and using a local variable for MK_CBO(fn) allow the compiler to lose sight of that when no optimization is done. Use a macro instead of a function and skip the local variable to ensure the compiler uses constants, matching the asm constraints. Reported-by: Yunhui Cui Closes: https://lore.kernel.org/all/20240117082514.42967-1-cuiyunhui@bytedance.com Fixes: a29e2a48afe3 ("RISC-V: selftests: Add CBO tests") Signed-off-by: Andrew Jones Link: https://lore.kernel.org/r/20240117130933.57514-2-ajones@ventanamicro.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/hwprobe/cbo.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/riscv/hwprobe/cbo.c b/tools/testing/selftests/riscv/hwprobe/cbo.c index c6a83ab11e223..c537d52fafc58 100644 --- a/tools/testing/selftests/riscv/hwprobe/cbo.c +++ b/tools/testing/selftests/riscv/hwprobe/cbo.c @@ -36,16 +36,14 @@ static void sigill_handler(int sig, siginfo_t *info, void *context) regs[0] += 4; } -static void cbo_insn(char *base, int fn) -{ - uint32_t insn = MK_CBO(fn); - - asm volatile( - "mv a0, %0\n" - "li a1, %1\n" - ".4byte %2\n" - : : "r" (base), "i" (fn), "i" (insn) : "a0", "a1", "memory"); -} +#define cbo_insn(base, fn) \ +({ \ + asm volatile( \ + "mv a0, %0\n" \ + "li a1, %1\n" \ + ".4byte %2\n" \ + : : "r" (base), "i" (fn), "i" (MK_CBO(fn)) : "a0", "a1", "memory"); \ +}) static void cbo_inval(char *base) { cbo_insn(base, 0); } static void cbo_clean(char *base) { cbo_insn(base, 1); } -- GitLab From 1e7196fa5b0312a6a3e49e7c1300e145afcba96b Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Mon, 8 Jan 2024 15:57:02 -0800 Subject: [PATCH 1074/1290] asm-generic: Improve csum_fold This csum_fold implementation introduced into arch/arc by Vineet Gupta is better than the default implementation on at least arc, x86, and riscv. Using GCC trunk and compiling non-inlined version, this implementation has 41.6667%, 25% fewer instructions on riscv64, x86-64 respectively with -O3 optimization. Most implmentations override this default in asm, but this should be more performant than all of those other implementations except for arm which has barrel shifting and sparc32 which has a carry flag. Signed-off-by: Charlie Jenkins Reviewed-by: David Laight Link: https://lore.kernel.org/r/20240108-optimize_checksum-v15-1-1c50de5f2167@rivosinc.com Signed-off-by: Palmer Dabbelt --- include/asm-generic/checksum.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h index 43e18db89c143..ad928cce268b4 100644 --- a/include/asm-generic/checksum.h +++ b/include/asm-generic/checksum.h @@ -2,6 +2,8 @@ #ifndef __ASM_GENERIC_CHECKSUM_H #define __ASM_GENERIC_CHECKSUM_H +#include + /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) @@ -31,9 +33,7 @@ extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl); static inline __sum16 csum_fold(__wsum csum) { u32 sum = (__force u32)csum; - sum = (sum & 0xffff) + (sum >> 16); - sum = (sum & 0xffff) + (sum >> 16); - return (__force __sum16)~sum; + return (__force __sum16)((~sum - ror32(sum, 16)) >> 16); } #endif -- GitLab From 2ce5729fce8f62b5118f56110d16006c0e22c522 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Mon, 8 Jan 2024 15:57:03 -0800 Subject: [PATCH 1075/1290] riscv: Add static key for misaligned accesses Support static branches depending on the value of misaligned accesses. This will be used by a later patch in the series. At any point in time, this static branch will only be enabled if all online CPUs are considered "fast". Signed-off-by: Charlie Jenkins Reviewed-by: Evan Green Link: https://lore.kernel.org/r/20240108-optimize_checksum-v15-2-1c50de5f2167@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpufeature.h | 2 + arch/riscv/kernel/cpufeature.c | 90 ++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index a418c3112cd60..7b129e5e2f075 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -133,4 +133,6 @@ static __always_inline bool riscv_cpu_has_extension_unlikely(int cpu, const unsi return __riscv_isa_extension_available(hart_isa[cpu].isa, ext); } +DECLARE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key); + #endif diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index b3785ffc15703..b62baeb504d80 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -8,8 +8,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -44,6 +46,8 @@ struct riscv_isainfo hart_isa[NR_CPUS]; /* Performance information */ DEFINE_PER_CPU(long, misaligned_access_speed); +static cpumask_t fast_misaligned_access; + /** * riscv_isa_extension_base() - Get base extension word * @@ -643,6 +647,16 @@ static int check_unaligned_access(void *param) (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow"); per_cpu(misaligned_access_speed, cpu) = speed; + + /* + * Set the value of fast_misaligned_access of a CPU. These operations + * are atomic to avoid race conditions. + */ + if (speed == RISCV_HWPROBE_MISALIGNED_FAST) + cpumask_set_cpu(cpu, &fast_misaligned_access); + else + cpumask_clear_cpu(cpu, &fast_misaligned_access); + return 0; } @@ -655,13 +669,69 @@ static void check_unaligned_access_nonboot_cpu(void *param) check_unaligned_access(pages[cpu]); } +DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key); + +static void modify_unaligned_access_branches(cpumask_t *mask, int weight) +{ + if (cpumask_weight(mask) == weight) + static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key); + else + static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key); +} + +static void set_unaligned_access_static_branches_except_cpu(int cpu) +{ + /* + * Same as set_unaligned_access_static_branches, except excludes the + * given CPU from the result. When a CPU is hotplugged into an offline + * state, this function is called before the CPU is set to offline in + * the cpumask, and thus the CPU needs to be explicitly excluded. + */ + + cpumask_t fast_except_me; + + cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask); + cpumask_clear_cpu(cpu, &fast_except_me); + + modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1); +} + +static void set_unaligned_access_static_branches(void) +{ + /* + * This will be called after check_unaligned_access_all_cpus so the + * result of unaligned access speed for all CPUs will be available. + * + * To avoid the number of online cpus changing between reading + * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be + * held before calling this function. + */ + + cpumask_t fast_and_online; + + cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask); + + modify_unaligned_access_branches(&fast_and_online, num_online_cpus()); +} + +static int lock_and_set_unaligned_access_static_branch(void) +{ + cpus_read_lock(); + set_unaligned_access_static_branches(); + cpus_read_unlock(); + + return 0; +} + +arch_initcall_sync(lock_and_set_unaligned_access_static_branch); + static int riscv_online_cpu(unsigned int cpu) { static struct page *buf; /* We are already set since the last check */ if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN) - return 0; + goto exit; buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); if (!buf) { @@ -671,6 +741,17 @@ static int riscv_online_cpu(unsigned int cpu) check_unaligned_access(buf); __free_pages(buf, MISALIGNED_BUFFER_ORDER); + +exit: + set_unaligned_access_static_branches(); + + return 0; +} + +static int riscv_offline_cpu(unsigned int cpu) +{ + set_unaligned_access_static_branches_except_cpu(cpu); + return 0; } @@ -705,9 +786,12 @@ static int check_unaligned_access_all_cpus(void) /* Check core 0. */ smp_call_on_cpu(0, check_unaligned_access, bufs[0], true); - /* Setup hotplug callback for any new CPUs that come online. */ + /* + * Setup hotplug callbacks for any new CPUs that come online or go + * offline. + */ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", - riscv_online_cpu, NULL); + riscv_online_cpu, riscv_offline_cpu); out: unaligned_emulation_finish(); -- GitLab From e11e367e9fe57164ea609807ed27184c85263355 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Mon, 8 Jan 2024 15:57:04 -0800 Subject: [PATCH 1076/1290] riscv: Add checksum header Provide checksum algorithms that have been designed to leverage riscv instructions such as rotate. In 64-bit, can take advantage of the larger register to avoid some overflow checking. Signed-off-by: Charlie Jenkins Acked-by: Conor Dooley Reviewed-by: Xiao Wang Link: https://lore.kernel.org/r/20240108-optimize_checksum-v15-3-1c50de5f2167@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/checksum.h | 82 +++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 arch/riscv/include/asm/checksum.h diff --git a/arch/riscv/include/asm/checksum.h b/arch/riscv/include/asm/checksum.h new file mode 100644 index 0000000000000..5a810126aac71 --- /dev/null +++ b/arch/riscv/include/asm/checksum.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Checksum routines + * + * Copyright (C) 2023 Rivos Inc. + */ +#ifndef __ASM_RISCV_CHECKSUM_H +#define __ASM_RISCV_CHECKSUM_H + +#include +#include + +#define ip_fast_csum ip_fast_csum + +/* Define riscv versions of functions before importing asm-generic/checksum.h */ +#include + +/** + * Quickly compute an IP checksum with the assumption that IPv4 headers will + * always be in multiples of 32-bits, and have an ihl of at least 5. + * + * @ihl: the number of 32 bit segments and must be greater than or equal to 5. + * @iph: assumed to be word aligned given that NET_IP_ALIGN is set to 2 on + * riscv, defining IP headers to be aligned. + */ +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + unsigned long csum = 0; + int pos = 0; + + do { + csum += ((const unsigned int *)iph)[pos]; + if (IS_ENABLED(CONFIG_32BIT)) + csum += csum < ((const unsigned int *)iph)[pos]; + } while (++pos < ihl); + + /* + * ZBB only saves three instructions on 32-bit and five on 64-bit so not + * worth checking if supported without Alternatives. + */ + if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && + IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { + unsigned long fold_temp; + + asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, + RISCV_ISA_EXT_ZBB, 1) + : + : + : + : no_zbb); + + if (IS_ENABLED(CONFIG_32BIT)) { + asm(".option push \n\ + .option arch,+zbb \n\ + not %[fold_temp], %[csum] \n\ + rori %[csum], %[csum], 16 \n\ + sub %[csum], %[fold_temp], %[csum] \n\ + .option pop" + : [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)); + } else { + asm(".option push \n\ + .option arch,+zbb \n\ + rori %[fold_temp], %[csum], 32 \n\ + add %[csum], %[fold_temp], %[csum] \n\ + srli %[csum], %[csum], 32 \n\ + not %[fold_temp], %[csum] \n\ + roriw %[csum], %[csum], 16 \n\ + subw %[csum], %[fold_temp], %[csum] \n\ + .option pop" + : [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp)); + } + return (__force __sum16)(csum >> 16); + } +no_zbb: +#ifndef CONFIG_32BIT + csum += ror64(csum, 32); + csum >>= 32; +#endif + return csum_fold((__force __wsum)csum); +} + +#endif /* __ASM_RISCV_CHECKSUM_H */ -- GitLab From a04c192eabfb76824d00f1b4cd0f25844a59d0f0 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Mon, 8 Jan 2024 15:57:05 -0800 Subject: [PATCH 1077/1290] riscv: Add checksum library Provide a 32 and 64 bit version of do_csum. When compiled for 32-bit will load from the buffer in groups of 32 bits, and when compiled for 64-bit will load in groups of 64 bits. Additionally provide riscv optimized implementation of csum_ipv6_magic. Signed-off-by: Charlie Jenkins Acked-by: Conor Dooley Reviewed-by: Xiao Wang Link: https://lore.kernel.org/r/20240108-optimize_checksum-v15-4-1c50de5f2167@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/checksum.h | 11 + arch/riscv/lib/Makefile | 1 + arch/riscv/lib/csum.c | 326 ++++++++++++++++++++++++++++++ 3 files changed, 338 insertions(+) create mode 100644 arch/riscv/lib/csum.c diff --git a/arch/riscv/include/asm/checksum.h b/arch/riscv/include/asm/checksum.h index 5a810126aac71..a5b60b54b101c 100644 --- a/arch/riscv/include/asm/checksum.h +++ b/arch/riscv/include/asm/checksum.h @@ -12,6 +12,17 @@ #define ip_fast_csum ip_fast_csum +extern unsigned int do_csum(const unsigned char *buff, int len); +#define do_csum do_csum + +/* Default version is sufficient for 32 bit */ +#ifndef CONFIG_32BIT +#define _HAVE_ARCH_IPV6_CSUM +__sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, __wsum sum); +#endif + /* Define riscv versions of functions before importing asm-generic/checksum.h */ #include diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index 26cb2502ecf89..2aa1a4ad361fb 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -6,6 +6,7 @@ lib-y += memmove.o lib-y += strcmp.o lib-y += strlen.o lib-y += strncmp.o +lib-y += csum.o lib-$(CONFIG_MMU) += uaccess.o lib-$(CONFIG_64BIT) += tishift.o lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o diff --git a/arch/riscv/lib/csum.c b/arch/riscv/lib/csum.c new file mode 100644 index 0000000000000..06ce8e7250d98 --- /dev/null +++ b/arch/riscv/lib/csum.c @@ -0,0 +1,326 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Checksum library + * + * Influenced by arch/arm64/lib/csum.c + * Copyright (C) 2023 Rivos Inc. + */ +#include +#include +#include +#include +#include + +#include + +#include + +/* Default version is sufficient for 32 bit */ +#ifndef CONFIG_32BIT +__sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, __wsum csum) +{ + unsigned int ulen, uproto; + unsigned long sum = (__force unsigned long)csum; + + sum += (__force unsigned long)saddr->s6_addr32[0]; + sum += (__force unsigned long)saddr->s6_addr32[1]; + sum += (__force unsigned long)saddr->s6_addr32[2]; + sum += (__force unsigned long)saddr->s6_addr32[3]; + + sum += (__force unsigned long)daddr->s6_addr32[0]; + sum += (__force unsigned long)daddr->s6_addr32[1]; + sum += (__force unsigned long)daddr->s6_addr32[2]; + sum += (__force unsigned long)daddr->s6_addr32[3]; + + ulen = (__force unsigned int)htonl((unsigned int)len); + sum += ulen; + + uproto = (__force unsigned int)htonl(proto); + sum += uproto; + + /* + * Zbb support saves 4 instructions, so not worth checking without + * alternatives if supported + */ + if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && + IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { + unsigned long fold_temp; + + /* + * Zbb is likely available when the kernel is compiled with Zbb + * support, so nop when Zbb is available and jump when Zbb is + * not available. + */ + asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, + RISCV_ISA_EXT_ZBB, 1) + : + : + : + : no_zbb); + asm(".option push \n\ + .option arch,+zbb \n\ + rori %[fold_temp], %[sum], 32 \n\ + add %[sum], %[fold_temp], %[sum] \n\ + srli %[sum], %[sum], 32 \n\ + not %[fold_temp], %[sum] \n\ + roriw %[sum], %[sum], 16 \n\ + subw %[sum], %[fold_temp], %[sum] \n\ + .option pop" + : [sum] "+r" (sum), [fold_temp] "=&r" (fold_temp)); + return (__force __sum16)(sum >> 16); + } +no_zbb: + sum += ror64(sum, 32); + sum >>= 32; + return csum_fold((__force __wsum)sum); +} +EXPORT_SYMBOL(csum_ipv6_magic); +#endif /* !CONFIG_32BIT */ + +#ifdef CONFIG_32BIT +#define OFFSET_MASK 3 +#elif CONFIG_64BIT +#define OFFSET_MASK 7 +#endif + +static inline __no_sanitize_address unsigned long +do_csum_common(const unsigned long *ptr, const unsigned long *end, + unsigned long data) +{ + unsigned int shift; + unsigned long csum = 0, carry = 0; + + /* + * Do 32-bit reads on RV32 and 64-bit reads otherwise. This should be + * faster than doing 32-bit reads on architectures that support larger + * reads. + */ + while (ptr < end) { + csum += data; + carry += csum < data; + data = *(ptr++); + } + + /* + * Perform alignment (and over-read) bytes on the tail if any bytes + * leftover. + */ + shift = ((long)ptr - (long)end) * 8; +#ifdef __LITTLE_ENDIAN + data = (data << shift) >> shift; +#else + data = (data >> shift) << shift; +#endif + csum += data; + carry += csum < data; + csum += carry; + csum += csum < carry; + + return csum; +} + +/* + * Algorithm accounts for buff being misaligned. + * If buff is not aligned, will over-read bytes but not use the bytes that it + * shouldn't. The same thing will occur on the tail-end of the read. + */ +static inline __no_sanitize_address unsigned int +do_csum_with_alignment(const unsigned char *buff, int len) +{ + unsigned int offset, shift; + unsigned long csum, data; + const unsigned long *ptr, *end; + + /* + * Align address to closest word (double word on rv64) that comes before + * buff. This should always be in the same page and cache line. + * Directly call KASAN with the alignment we will be using. + */ + offset = (unsigned long)buff & OFFSET_MASK; + kasan_check_read(buff, len); + ptr = (const unsigned long *)(buff - offset); + + /* + * Clear the most significant bytes that were over-read if buff was not + * aligned. + */ + shift = offset * 8; + data = *(ptr++); +#ifdef __LITTLE_ENDIAN + data = (data >> shift) << shift; +#else + data = (data << shift) >> shift; +#endif + end = (const unsigned long *)(buff + len); + csum = do_csum_common(ptr, end, data); + + /* + * Zbb support saves 6 instructions, so not worth checking without + * alternatives if supported + */ + if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && + IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { + unsigned long fold_temp; + + /* + * Zbb is likely available when the kernel is compiled with Zbb + * support, so nop when Zbb is available and jump when Zbb is + * not available. + */ + asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, + RISCV_ISA_EXT_ZBB, 1) + : + : + : + : no_zbb); + +#ifdef CONFIG_32BIT + asm_volatile_goto(".option push \n\ + .option arch,+zbb \n\ + rori %[fold_temp], %[csum], 16 \n\ + andi %[offset], %[offset], 1 \n\ + add %[csum], %[fold_temp], %[csum] \n\ + beq %[offset], zero, %l[end] \n\ + rev8 %[csum], %[csum] \n\ + .option pop" + : [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp) + : [offset] "r" (offset) + : + : end); + + return (unsigned short)csum; +#else /* !CONFIG_32BIT */ + asm_volatile_goto(".option push \n\ + .option arch,+zbb \n\ + rori %[fold_temp], %[csum], 32 \n\ + add %[csum], %[fold_temp], %[csum] \n\ + srli %[csum], %[csum], 32 \n\ + roriw %[fold_temp], %[csum], 16 \n\ + addw %[csum], %[fold_temp], %[csum] \n\ + andi %[offset], %[offset], 1 \n\ + beq %[offset], zero, %l[end] \n\ + rev8 %[csum], %[csum] \n\ + .option pop" + : [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp) + : [offset] "r" (offset) + : + : end); + + return (csum << 16) >> 48; +#endif /* !CONFIG_32BIT */ +end: + return csum >> 16; + } +no_zbb: +#ifndef CONFIG_32BIT + csum += ror64(csum, 32); + csum >>= 32; +#endif + csum = (u32)csum + ror32((u32)csum, 16); + if (offset & 1) + return (u16)swab32(csum); + return csum >> 16; +} + +/* + * Does not perform alignment, should only be used if machine has fast + * misaligned accesses, or when buff is known to be aligned. + */ +static inline __no_sanitize_address unsigned int +do_csum_no_alignment(const unsigned char *buff, int len) +{ + unsigned long csum, data; + const unsigned long *ptr, *end; + + ptr = (const unsigned long *)(buff); + data = *(ptr++); + + kasan_check_read(buff, len); + + end = (const unsigned long *)(buff + len); + csum = do_csum_common(ptr, end, data); + + /* + * Zbb support saves 6 instructions, so not worth checking without + * alternatives if supported + */ + if (IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && + IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) { + unsigned long fold_temp; + + /* + * Zbb is likely available when the kernel is compiled with Zbb + * support, so nop when Zbb is available and jump when Zbb is + * not available. + */ + asm_volatile_goto(ALTERNATIVE("j %l[no_zbb]", "nop", 0, + RISCV_ISA_EXT_ZBB, 1) + : + : + : + : no_zbb); + +#ifdef CONFIG_32BIT + asm (".option push \n\ + .option arch,+zbb \n\ + rori %[fold_temp], %[csum], 16 \n\ + add %[csum], %[fold_temp], %[csum] \n\ + .option pop" + : [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp) + : + : ); + +#else /* !CONFIG_32BIT */ + asm (".option push \n\ + .option arch,+zbb \n\ + rori %[fold_temp], %[csum], 32 \n\ + add %[csum], %[fold_temp], %[csum] \n\ + srli %[csum], %[csum], 32 \n\ + roriw %[fold_temp], %[csum], 16 \n\ + addw %[csum], %[fold_temp], %[csum] \n\ + .option pop" + : [csum] "+r" (csum), [fold_temp] "=&r" (fold_temp) + : + : ); +#endif /* !CONFIG_32BIT */ + return csum >> 16; + } +no_zbb: +#ifndef CONFIG_32BIT + csum += ror64(csum, 32); + csum >>= 32; +#endif + csum = (u32)csum + ror32((u32)csum, 16); + return csum >> 16; +} + +/* + * Perform a checksum on an arbitrary memory address. + * Will do a light-weight address alignment if buff is misaligned, unless + * cpu supports fast misaligned accesses. + */ +unsigned int do_csum(const unsigned char *buff, int len) +{ + if (unlikely(len <= 0)) + return 0; + + /* + * Significant performance gains can be seen by not doing alignment + * on machines with fast misaligned accesses. + * + * There is some duplicate code between the "with_alignment" and + * "no_alignment" implmentations, but the overlap is too awkward to be + * able to fit in one function without introducing multiple static + * branches. The largest chunk of overlap was delegated into the + * do_csum_common function. + */ + if (static_branch_likely(&fast_misaligned_access_speed_key)) + return do_csum_no_alignment(buff, len); + + if (((unsigned long)buff & OFFSET_MASK) == 0) + return do_csum_no_alignment(buff, len); + + return do_csum_with_alignment(buff, len); +} -- GitLab From 6f4c45cbcb00d649475a3099235e5b4fce569b4b Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Mon, 8 Jan 2024 15:57:06 -0800 Subject: [PATCH 1078/1290] kunit: Add tests for csum_ipv6_magic and ip_fast_csum Supplement existing checksum tests with tests for csum_ipv6_magic and ip_fast_csum. Signed-off-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240108-optimize_checksum-v15-5-1c50de5f2167@rivosinc.com Signed-off-by: Palmer Dabbelt --- lib/checksum_kunit.c | 284 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 283 insertions(+), 1 deletion(-) diff --git a/lib/checksum_kunit.c b/lib/checksum_kunit.c index 0eed92b77ba37..af3e5ca4e1702 100644 --- a/lib/checksum_kunit.c +++ b/lib/checksum_kunit.c @@ -1,15 +1,21 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Test cases csum_partial and csum_fold + * Test cases csum_partial, csum_fold, ip_fast_csum, csum_ipv6_magic */ #include #include +#include #define MAX_LEN 512 #define MAX_ALIGN 64 #define TEST_BUFLEN (MAX_LEN + MAX_ALIGN) +#define IPv4_MIN_WORDS 5 +#define IPv4_MAX_WORDS 15 +#define NUM_IPv6_TESTS 200 +#define NUM_IP_FAST_CSUM_TESTS 181 + /* Values for a little endian CPU. Byte swap each half on big endian CPU. */ static const u32 random_init_sum = 0x2847aab; static const u8 random_buf[] = { @@ -209,6 +215,237 @@ static const u32 init_sums_no_overflow[] = { 0xffff0000, 0xfffffffb, }; +static const __sum16 expected_csum_ipv6_magic[] = { + 0x18d4, 0x3085, 0x2e4b, 0xd9f4, 0xbdc8, 0x78f, 0x1034, 0x8422, 0x6fc0, + 0xd2f6, 0xbeb5, 0x9d3, 0x7e2a, 0x312e, 0x778e, 0xc1bb, 0x7cf2, 0x9d1e, + 0xca21, 0xf3ff, 0x7569, 0xb02e, 0xca86, 0x7e76, 0x4539, 0x45e3, 0xf28d, + 0xdf81, 0x8fd5, 0x3b5d, 0x8324, 0xf471, 0x83be, 0x1daf, 0x8c46, 0xe682, + 0xd1fb, 0x6b2e, 0xe687, 0x2a33, 0x4833, 0x2d67, 0x660f, 0x2e79, 0xd65e, + 0x6b62, 0x6672, 0x5dbd, 0x8680, 0xbaa5, 0x2229, 0x2125, 0x2d01, 0x1cc0, + 0x6d36, 0x33c0, 0xee36, 0xd832, 0x9820, 0x8a31, 0x53c5, 0x2e2, 0xdb0e, + 0x49ed, 0x17a7, 0x77a0, 0xd72e, 0x3d72, 0x7dc8, 0x5b17, 0xf55d, 0xa4d9, + 0x1446, 0x5d56, 0x6b2e, 0x69a5, 0xadb6, 0xff2a, 0x92e, 0xe044, 0x3402, + 0xbb60, 0xec7f, 0xe7e6, 0x1986, 0x32f4, 0x8f8, 0x5e00, 0x47c6, 0x3059, + 0x3969, 0xe957, 0x4388, 0x2854, 0x3334, 0xea71, 0xa6de, 0x33f9, 0x83fc, + 0x37b4, 0x5531, 0x3404, 0x1010, 0xed30, 0x610a, 0xc95, 0x9aed, 0x6ff, + 0x5136, 0x2741, 0x660e, 0x8b80, 0xf71, 0xa263, 0x88af, 0x7a73, 0x3c37, + 0x1908, 0x6db5, 0x2e92, 0x1cd2, 0x70c8, 0xee16, 0xe80, 0xcd55, 0x6e6, + 0x6434, 0x127, 0x655d, 0x2ea0, 0xb4f4, 0xdc20, 0x5671, 0xe462, 0xe52b, + 0xdb44, 0x3589, 0xc48f, 0xe60b, 0xd2d2, 0x66ad, 0x498, 0x436, 0xb917, + 0xf0ca, 0x1a6e, 0x1cb7, 0xbf61, 0x2870, 0xc7e8, 0x5b30, 0xe4a5, 0x168, + 0xadfc, 0xd035, 0xe690, 0xe283, 0xfb27, 0xe4ad, 0xb1a5, 0xf2d5, 0xc4b6, + 0x8a30, 0xd7d5, 0x7df9, 0x91d5, 0x63ed, 0x2d21, 0x312b, 0xab19, 0xa632, + 0x8d2e, 0xef06, 0x57b9, 0xc373, 0xbd1f, 0xa41f, 0x8444, 0x9975, 0x90cb, + 0xc49c, 0xe965, 0x4eff, 0x5a, 0xef6d, 0xe81a, 0xe260, 0x853a, 0xff7a, + 0x99aa, 0xb06b, 0xee19, 0xcc2c, 0xf34c, 0x7c49, 0xdac3, 0xa71e, 0xc988, + 0x3845, 0x1014 +}; + +static const __sum16 expected_fast_csum[] = { + 0xda83, 0x45da, 0x4f46, 0x4e4f, 0x34e, 0xe902, 0xa5e9, 0x87a5, 0x7187, + 0x5671, 0xf556, 0x6df5, 0x816d, 0x8f81, 0xbb8f, 0xfbba, 0x5afb, 0xbe5a, + 0xedbe, 0xabee, 0x6aac, 0xe6b, 0xea0d, 0x67ea, 0x7e68, 0x8a7e, 0x6f8a, + 0x3a70, 0x9f3a, 0xe89e, 0x75e8, 0x7976, 0xfa79, 0x2cfa, 0x3c2c, 0x463c, + 0x7146, 0x7a71, 0x547a, 0xfd53, 0x99fc, 0xb699, 0x92b6, 0xdb91, 0xe8da, + 0x5fe9, 0x1e60, 0xae1d, 0x39ae, 0xf439, 0xa1f4, 0xdda1, 0xede, 0x790f, + 0x579, 0x1206, 0x9012, 0x2490, 0xd224, 0x5cd2, 0xa65d, 0xca7, 0x220d, + 0xf922, 0xbf9, 0x920b, 0x1b92, 0x361c, 0x2e36, 0x4d2e, 0x24d, 0x2, + 0xcfff, 0x90cf, 0xa591, 0x93a5, 0x7993, 0x9579, 0xc894, 0x50c8, 0x5f50, + 0xd55e, 0xcad5, 0xf3c9, 0x8f4, 0x4409, 0x5043, 0x5b50, 0x55b, 0x2205, + 0x1e22, 0x801e, 0x3780, 0xe137, 0x7ee0, 0xf67d, 0x3cf6, 0xa53c, 0x2ea5, + 0x472e, 0x5147, 0xcf51, 0x1bcf, 0x951c, 0x1e95, 0xc71e, 0xe4c7, 0xc3e4, + 0x3dc3, 0xee3d, 0xa4ed, 0xf9a4, 0xcbf8, 0x75cb, 0xb375, 0x50b4, 0x3551, + 0xf835, 0x19f8, 0x8c1a, 0x538c, 0xad52, 0xa3ac, 0xb0a3, 0x5cb0, 0x6c5c, + 0x5b6c, 0xc05a, 0x92c0, 0x4792, 0xbe47, 0x53be, 0x1554, 0x5715, 0x4b57, + 0xe54a, 0x20e5, 0x21, 0xd500, 0xa1d4, 0xa8a1, 0x57a9, 0xca57, 0x5ca, + 0x1c06, 0x4f1c, 0xe24e, 0xd9e2, 0xf0d9, 0x4af1, 0x474b, 0x8146, 0xe81, + 0xfd0e, 0x84fd, 0x7c85, 0xba7c, 0x17ba, 0x4a17, 0x964a, 0xf595, 0xff5, + 0x5310, 0x3253, 0x6432, 0x4263, 0x2242, 0xe121, 0x32e1, 0xf632, 0xc5f5, + 0x21c6, 0x7d22, 0x8e7c, 0x418e, 0x5641, 0x3156, 0x7c31, 0x737c, 0x373, + 0x2503, 0xc22a, 0x3c2, 0x4a04, 0x8549, 0x5285, 0xa352, 0xe8a3, 0x6fe8, + 0x1a6f, 0x211a, 0xe021, 0x38e0, 0x7638, 0xf575, 0x9df5, 0x169e, 0xf116, + 0x23f1, 0xcd23, 0xece, 0x660f, 0x4866, 0x6a48, 0x716a, 0xee71, 0xa2ee, + 0xb8a2, 0x61b9, 0xa361, 0xf7a2, 0x26f7, 0x1127, 0x6611, 0xe065, 0x36e0, + 0x1837, 0x3018, 0x1c30, 0x721b, 0x3e71, 0xe43d, 0x99e4, 0x9e9a, 0xb79d, + 0xa9b7, 0xcaa, 0xeb0c, 0x4eb, 0x1305, 0x8813, 0xb687, 0xa9b6, 0xfba9, + 0xd7fb, 0xccd8, 0x2ecd, 0x652f, 0xae65, 0x3fae, 0x3a40, 0x563a, 0x7556, + 0x2776, 0x1228, 0xef12, 0xf9ee, 0xcef9, 0x56cf, 0xa956, 0x24a9, 0xba24, + 0x5fba, 0x665f, 0xf465, 0x8ff4, 0x6d8f, 0x346d, 0x5f34, 0x385f, 0xd137, + 0xb8d0, 0xacb8, 0x55ac, 0x7455, 0xe874, 0x89e8, 0xd189, 0xa0d1, 0xb2a0, + 0xb8b2, 0x36b8, 0x5636, 0xd355, 0x8d3, 0x1908, 0x2118, 0xc21, 0x990c, + 0x8b99, 0x158c, 0x7815, 0x9e78, 0x6f9e, 0x4470, 0x1d44, 0x341d, 0x2634, + 0x3f26, 0x793e, 0xc79, 0xcc0b, 0x26cc, 0xd126, 0x1fd1, 0xb41f, 0xb6b4, + 0x22b7, 0xa122, 0xa1, 0x7f01, 0x837e, 0x3b83, 0xaf3b, 0x6fae, 0x916f, + 0xb490, 0xffb3, 0xceff, 0x50cf, 0x7550, 0x7275, 0x1272, 0x2613, 0xaa26, + 0xd5aa, 0x7d5, 0x9607, 0x96, 0xb100, 0xf8b0, 0x4bf8, 0xdd4c, 0xeddd, + 0x98ed, 0x2599, 0x9325, 0xeb92, 0x8feb, 0xcc8f, 0x2acd, 0x392b, 0x3b39, + 0xcb3b, 0x6acb, 0xd46a, 0xb8d4, 0x6ab8, 0x106a, 0x2f10, 0x892f, 0x789, + 0xc806, 0x45c8, 0x7445, 0x3c74, 0x3a3c, 0xcf39, 0xd7ce, 0x58d8, 0x6e58, + 0x336e, 0x1034, 0xee10, 0xe9ed, 0xc2e9, 0x3fc2, 0xd53e, 0xd2d4, 0xead2, + 0x8fea, 0x2190, 0x1162, 0xbe11, 0x8cbe, 0x6d8c, 0xfb6c, 0x6dfb, 0xd36e, + 0x3ad3, 0xf3a, 0x870e, 0xc287, 0x53c3, 0xc54, 0x5b0c, 0x7d5a, 0x797d, + 0xec79, 0x5dec, 0x4d5e, 0x184e, 0xd618, 0x60d6, 0xb360, 0x98b3, 0xf298, + 0xb1f2, 0x69b1, 0xf969, 0xef9, 0xab0e, 0x21ab, 0xe321, 0x24e3, 0x8224, + 0x5481, 0x5954, 0x7a59, 0xff7a, 0x7dff, 0x1a7d, 0xa51a, 0x46a5, 0x6b47, + 0xe6b, 0x830e, 0xa083, 0xff9f, 0xd0ff, 0xffd0, 0xe6ff, 0x7de7, 0xc67d, + 0xd0c6, 0x61d1, 0x3a62, 0xc3b, 0x150c, 0x1715, 0x4517, 0x5345, 0x3954, + 0xdd39, 0xdadd, 0x32db, 0x6a33, 0xd169, 0x86d1, 0xb687, 0x3fb6, 0x883f, + 0xa487, 0x39a4, 0x2139, 0xbe20, 0xffbe, 0xedfe, 0x8ded, 0x368e, 0xc335, + 0x51c3, 0x9851, 0xf297, 0xd6f2, 0xb9d6, 0x95ba, 0x2096, 0xea1f, 0x76e9, + 0x4e76, 0xe04d, 0xd0df, 0x80d0, 0xa280, 0xfca2, 0x75fc, 0xef75, 0x32ef, + 0x6833, 0xdf68, 0xc4df, 0x76c4, 0xb77, 0xb10a, 0xbfb1, 0x58bf, 0x5258, + 0x4d52, 0x6c4d, 0x7e6c, 0xb67e, 0xccb5, 0x8ccc, 0xbe8c, 0xc8bd, 0x9ac8, + 0xa99b, 0x52a9, 0x2f53, 0xc30, 0x3e0c, 0xb83d, 0x83b7, 0x5383, 0x7e53, + 0x4f7e, 0xe24e, 0xb3e1, 0x8db3, 0x618e, 0xc861, 0xfcc8, 0x34fc, 0x9b35, + 0xaa9b, 0xb1aa, 0x5eb1, 0x395e, 0x8639, 0xd486, 0x8bd4, 0x558b, 0x2156, + 0xf721, 0x4ef6, 0x14f, 0x7301, 0xdd72, 0x49de, 0x894a, 0x9889, 0x8898, + 0x7788, 0x7b77, 0x637b, 0xb963, 0xabb9, 0x7cab, 0xc87b, 0x21c8, 0xcb21, + 0xdfca, 0xbfdf, 0xf2bf, 0x6af2, 0x626b, 0xb261, 0x3cb2, 0xc63c, 0xc9c6, + 0xc9c9, 0xb4c9, 0xf9b4, 0x91f9, 0x4091, 0x3a40, 0xcc39, 0xd1cb, 0x7ed1, + 0x537f, 0x6753, 0xa167, 0xba49, 0x88ba, 0x7789, 0x3877, 0xf037, 0xd3ef, + 0xb5d4, 0x55b6, 0xa555, 0xeca4, 0xa1ec, 0xb6a2, 0x7b7, 0x9507, 0xfd94, + 0x82fd, 0x5c83, 0x765c, 0x9676, 0x3f97, 0xda3f, 0x6fda, 0x646f, 0x3064, + 0x5e30, 0x655e, 0x6465, 0xcb64, 0xcdca, 0x4ccd, 0x3f4c, 0x243f, 0x6f24, + 0x656f, 0x6065, 0x3560, 0x3b36, 0xac3b, 0x4aac, 0x714a, 0x7e71, 0xda7e, + 0x7fda, 0xda7f, 0x6fda, 0xff6f, 0xc6ff, 0xedc6, 0xd4ed, 0x70d5, 0xeb70, + 0xa3eb, 0x80a3, 0xca80, 0x3fcb, 0x2540, 0xf825, 0x7ef8, 0xf87e, 0x73f8, + 0xb474, 0xb4b4, 0x92b5, 0x9293, 0x93, 0x3500, 0x7134, 0x9071, 0xfa8f, + 0x51fa, 0x1452, 0xba13, 0x7ab9, 0x957a, 0x8a95, 0x6e8a, 0x6d6e, 0x7c6d, + 0x447c, 0x9744, 0x4597, 0x8945, 0xef88, 0x8fee, 0x3190, 0x4831, 0x8447, + 0xa183, 0x1da1, 0xd41d, 0x2dd4, 0x4f2e, 0xc94e, 0xcbc9, 0xc9cb, 0x9ec9, + 0x319e, 0xd531, 0x20d5, 0x4021, 0xb23f, 0x29b2, 0xd828, 0xecd8, 0x5ded, + 0xfc5d, 0x4dfc, 0xd24d, 0x6bd2, 0x5f6b, 0xb35e, 0x7fb3, 0xee7e, 0x56ee, + 0xa657, 0x68a6, 0x8768, 0x7787, 0xb077, 0x4cb1, 0x764c, 0xb175, 0x7b1, + 0x3d07, 0x603d, 0x3560, 0x3e35, 0xb03d, 0xd6b0, 0xc8d6, 0xd8c8, 0x8bd8, + 0x3e8c, 0x303f, 0xd530, 0xf1d4, 0x42f1, 0xca42, 0xddca, 0x41dd, 0x3141, + 0x132, 0xe901, 0x8e9, 0xbe09, 0xe0bd, 0x2ce0, 0x862d, 0x3986, 0x9139, + 0x6d91, 0x6a6d, 0x8d6a, 0x1b8d, 0xac1b, 0xedab, 0x54ed, 0xc054, 0xcebf, + 0xc1ce, 0x5c2, 0x3805, 0x6038, 0x5960, 0xd359, 0xdd3, 0xbe0d, 0xafbd, + 0x6daf, 0x206d, 0x2c20, 0x862c, 0x8e86, 0xec8d, 0xa2ec, 0xa3a2, 0x51a3, + 0x8051, 0xfd7f, 0x91fd, 0xa292, 0xaf14, 0xeeae, 0x59ef, 0x535a, 0x8653, + 0x3986, 0x9539, 0xb895, 0xa0b8, 0x26a0, 0x2227, 0xc022, 0x77c0, 0xad77, + 0x46ad, 0xaa46, 0x60aa, 0x8560, 0x4785, 0xd747, 0x45d7, 0x2346, 0x5f23, + 0x25f, 0x1d02, 0x71d, 0x8206, 0xc82, 0x180c, 0x3018, 0x4b30, 0x4b, + 0x3001, 0x1230, 0x2d12, 0x8c2d, 0x148d, 0x4015, 0x5f3f, 0x3d5f, 0x6b3d, + 0x396b, 0x473a, 0xf746, 0x44f7, 0x8945, 0x3489, 0xcb34, 0x84ca, 0xd984, + 0xf0d9, 0xbcf0, 0x63bd, 0x3264, 0xf332, 0x45f3, 0x7346, 0x5673, 0xb056, + 0xd3b0, 0x4ad4, 0x184b, 0x7d18, 0x6c7d, 0xbb6c, 0xfeba, 0xe0fe, 0x10e1, + 0x5410, 0x2954, 0x9f28, 0x3a9f, 0x5a3a, 0xdb59, 0xbdc, 0xb40b, 0x1ab4, + 0x131b, 0x5d12, 0x6d5c, 0xe16c, 0xb0e0, 0x89b0, 0xba88, 0xbb, 0x3c01, + 0xe13b, 0x6fe1, 0x446f, 0xa344, 0x81a3, 0xfe81, 0xc7fd, 0x38c8, 0xb38, + 0x1a0b, 0x6d19, 0xf36c, 0x47f3, 0x6d48, 0xb76d, 0xd3b7, 0xd8d2, 0x52d9, + 0x4b53, 0xa54a, 0x34a5, 0xc534, 0x9bc4, 0xed9b, 0xbeed, 0x3ebe, 0x233e, + 0x9f22, 0x4a9f, 0x774b, 0x4577, 0xa545, 0x64a5, 0xb65, 0x870b, 0x487, + 0x9204, 0x5f91, 0xd55f, 0x35d5, 0x1a35, 0x71a, 0x7a07, 0x4e7a, 0xfc4e, + 0x1efc, 0x481f, 0x7448, 0xde74, 0xa7dd, 0x1ea7, 0xaa1e, 0xcfaa, 0xfbcf, + 0xedfb, 0x6eee, 0x386f, 0x4538, 0x6e45, 0xd96d, 0x11d9, 0x7912, 0x4b79, + 0x494b, 0x6049, 0xac5f, 0x65ac, 0x1366, 0x5913, 0xe458, 0x7ae4, 0x387a, + 0x3c38, 0xb03c, 0x76b0, 0x9376, 0xe193, 0x42e1, 0x7742, 0x6476, 0x3564, + 0x3c35, 0x6a3c, 0xcc69, 0x94cc, 0x5d95, 0xe5e, 0xee0d, 0x4ced, 0xce4c, + 0x52ce, 0xaa52, 0xdaaa, 0xe4da, 0x1de5, 0x4530, 0x5445, 0x3954, 0xb639, + 0x81b6, 0x7381, 0x1574, 0xc215, 0x10c2, 0x3f10, 0x6b3f, 0xe76b, 0x7be7, + 0xbc7b, 0xf7bb, 0x41f7, 0xcc41, 0x38cc, 0x4239, 0xa942, 0x4a9, 0xc504, + 0x7cc4, 0x437c, 0x6743, 0xea67, 0x8dea, 0xe88d, 0xd8e8, 0xdcd8, 0x17dd, + 0x5718, 0x958, 0xa609, 0x41a5, 0x5842, 0x159, 0x9f01, 0x269f, 0x5a26, + 0x405a, 0xc340, 0xb4c3, 0xd4b4, 0xf4d3, 0xf1f4, 0x39f2, 0xe439, 0x67e4, + 0x4168, 0xa441, 0xdda3, 0xdedd, 0x9df, 0xab0a, 0xa5ab, 0x9a6, 0xba09, + 0x9ab9, 0xad9a, 0x5ae, 0xe205, 0xece2, 0xecec, 0x14ed, 0xd614, 0x6bd5, + 0x916c, 0x3391, 0x6f33, 0x206f, 0x8020, 0x780, 0x7207, 0x2472, 0x8a23, + 0xb689, 0x3ab6, 0xf739, 0x97f6, 0xb097, 0xa4b0, 0xe6a4, 0x88e6, 0x2789, + 0xb28, 0x350b, 0x1f35, 0x431e, 0x1043, 0xc30f, 0x79c3, 0x379, 0x5703, + 0x3256, 0x4732, 0x7247, 0x9d72, 0x489d, 0xd348, 0xa4d3, 0x7ca4, 0xbf7b, + 0x45c0, 0x7b45, 0x337b, 0x4034, 0x843f, 0xd083, 0x35d0, 0x6335, 0x4d63, + 0xe14c, 0xcce0, 0xfecc, 0x35ff, 0x5636, 0xf856, 0xeef8, 0x2def, 0xfc2d, + 0x4fc, 0x6e04, 0xb66d, 0x78b6, 0xbb78, 0x3dbb, 0x9a3d, 0x839a, 0x9283, + 0x593, 0xd504, 0x23d5, 0x5424, 0xd054, 0x61d0, 0xdb61, 0x17db, 0x1f18, + 0x381f, 0x9e37, 0x679e, 0x1d68, 0x381d, 0x8038, 0x917f, 0x491, 0xbb04, + 0x23bb, 0x4124, 0xd41, 0xa30c, 0x8ba3, 0x8b8b, 0xc68b, 0xd2c6, 0xebd2, + 0x93eb, 0xbd93, 0x99bd, 0x1a99, 0xea19, 0x58ea, 0xcf58, 0x73cf, 0x1073, + 0x9e10, 0x139e, 0xea13, 0xcde9, 0x3ecd, 0x883f, 0xf89, 0x180f, 0x2a18, + 0x212a, 0xce20, 0x73ce, 0xf373, 0x60f3, 0xad60, 0x4093, 0x8e40, 0xb98e, + 0xbfb9, 0xf1bf, 0x8bf1, 0x5e8c, 0xe95e, 0x14e9, 0x4e14, 0x1c4e, 0x7f1c, + 0xe77e, 0x6fe7, 0xf26f, 0x13f2, 0x8b13, 0xda8a, 0x5fda, 0xea5f, 0x4eea, + 0xa84f, 0x88a8, 0x1f88, 0x2820, 0x9728, 0x5a97, 0x3f5b, 0xb23f, 0x70b2, + 0x2c70, 0x232d, 0xf623, 0x4f6, 0x905, 0x7509, 0xd675, 0x28d7, 0x9428, + 0x3794, 0xf036, 0x2bf0, 0xba2c, 0xedb9, 0xd7ed, 0x59d8, 0xed59, 0x4ed, + 0xe304, 0x18e3, 0x5c19, 0x3d5c, 0x753d, 0x6d75, 0x956d, 0x7f95, 0xc47f, + 0x83c4, 0xa84, 0x2e0a, 0x5f2e, 0xb95f, 0x77b9, 0x6d78, 0xf46d, 0x1bf4, + 0xed1b, 0xd6ed, 0xe0d6, 0x5e1, 0x3905, 0x5638, 0xa355, 0x99a2, 0xbe99, + 0xb4bd, 0x85b4, 0x2e86, 0x542e, 0x6654, 0xd765, 0x73d7, 0x3a74, 0x383a, + 0x2638, 0x7826, 0x7677, 0x9a76, 0x7e99, 0x2e7e, 0xea2d, 0xa6ea, 0x8a7, + 0x109, 0x3300, 0xad32, 0x5fad, 0x465f, 0x2f46, 0xc62f, 0xd4c5, 0xad5, + 0xcb0a, 0x4cb, 0xb004, 0x7baf, 0xe47b, 0x92e4, 0x8e92, 0x638e, 0x1763, + 0xc17, 0xf20b, 0x1ff2, 0x8920, 0x5889, 0xcb58, 0xf8cb, 0xcaf8, 0x84cb, + 0x9f84, 0x8a9f, 0x918a, 0x4991, 0x8249, 0xff81, 0x46ff, 0x5046, 0x5f50, + 0x725f, 0xf772, 0x8ef7, 0xe08f, 0xc1e0, 0x1fc2, 0x9e1f, 0x8b9d, 0x108b, + 0x411, 0x2b04, 0xb02a, 0x1fb0, 0x1020, 0x7a0f, 0x587a, 0x8958, 0xb188, + 0xb1b1, 0x49b2, 0xb949, 0x7ab9, 0x917a, 0xfc91, 0xe6fc, 0x47e7, 0xbc47, + 0x8fbb, 0xea8e, 0x34ea, 0x2635, 0x1726, 0x9616, 0xc196, 0xa6c1, 0xf3a6, + 0x11f3, 0x4811, 0x3e48, 0xeb3e, 0xf7ea, 0x1bf8, 0xdb1c, 0x8adb, 0xe18a, + 0x42e1, 0x9d42, 0x5d9c, 0x6e5d, 0x286e, 0x4928, 0x9a49, 0xb09c, 0xa6b0, + 0x2a7, 0xe702, 0xf5e6, 0x9af5, 0xf9b, 0x810f, 0x8080, 0x180, 0x1702, + 0x5117, 0xa650, 0x11a6, 0x1011, 0x550f, 0xd554, 0xbdd5, 0x6bbe, 0xc66b, + 0xfc7, 0x5510, 0x5555, 0x7655, 0x177, 0x2b02, 0x6f2a, 0xb70, 0x9f0b, + 0xcf9e, 0xf3cf, 0x3ff4, 0xcb40, 0x8ecb, 0x768e, 0x5277, 0x8652, 0x9186, + 0x9991, 0x5099, 0xd350, 0x93d3, 0x6d94, 0xe6d, 0x530e, 0x3153, 0xa531, + 0x64a5, 0x7964, 0x7c79, 0x467c, 0x1746, 0x3017, 0x3730, 0x538, 0x5, + 0x1e00, 0x5b1e, 0x955a, 0xae95, 0x3eaf, 0xff3e, 0xf8ff, 0xb2f9, 0xa1b3, + 0xb2a1, 0x5b2, 0xad05, 0x7cac, 0x2d7c, 0xd32c, 0x80d2, 0x7280, 0x8d72, + 0x1b8e, 0x831b, 0xac82, 0xfdac, 0xa7fd, 0x15a8, 0xd614, 0xe0d5, 0x7be0, + 0xb37b, 0x61b3, 0x9661, 0x9d95, 0xc79d, 0x83c7, 0xd883, 0xead7, 0xceb, + 0xf60c, 0xa9f5, 0x19a9, 0xa019, 0x8f9f, 0xd48f, 0x3ad5, 0x853a, 0x985, + 0x5309, 0x6f52, 0x1370, 0x6e13, 0xa96d, 0x98a9, 0x5198, 0x9f51, 0xb69f, + 0xa1b6, 0x2ea1, 0x672e, 0x2067, 0x6520, 0xaf65, 0x6eaf, 0x7e6f, 0xee7e, + 0x17ef, 0xa917, 0xcea8, 0x9ace, 0xff99, 0x5dff, 0xdf5d, 0x38df, 0xa39, + 0x1c0b, 0xe01b, 0x46e0, 0xcb46, 0x90cb, 0xba90, 0x4bb, 0x9104, 0x9d90, + 0xc89c, 0xf6c8, 0x6cf6, 0x886c, 0x1789, 0xbd17, 0x70bc, 0x7e71, 0x17e, + 0x1f01, 0xa01f, 0xbaa0, 0x14bb, 0xfc14, 0x7afb, 0xa07a, 0x3da0, 0xbf3d, + 0x48bf, 0x8c48, 0x968b, 0x9d96, 0xfd9d, 0x96fd, 0x9796, 0x6b97, 0xd16b, + 0xf4d1, 0x3bf4, 0x253c, 0x9125, 0x6691, 0xc166, 0x34c1, 0x5735, 0x1a57, + 0xdc19, 0x77db, 0x8577, 0x4a85, 0x824a, 0x9182, 0x7f91, 0xfd7f, 0xb4c3, + 0xb5b4, 0xb3b5, 0x7eb3, 0x617e, 0x4e61, 0xa4f, 0x530a, 0x3f52, 0xa33e, + 0x34a3, 0x9234, 0xf091, 0xf4f0, 0x1bf5, 0x311b, 0x9631, 0x6a96, 0x386b, + 0x1d39, 0xe91d, 0xe8e9, 0x69e8, 0x426a, 0xee42, 0x89ee, 0x368a, 0x2837, + 0x7428, 0x5974, 0x6159, 0x1d62, 0x7b1d, 0xf77a, 0x7bf7, 0x6b7c, 0x696c, + 0xf969, 0x4cf9, 0x714c, 0x4e71, 0x6b4e, 0x256c, 0x6e25, 0xe96d, 0x94e9, + 0x8f94, 0x3e8f, 0x343e, 0x4634, 0xb646, 0x97b5, 0x8997, 0xe8a, 0x900e, + 0x8090, 0xfd80, 0xa0fd, 0x16a1, 0xf416, 0xebf4, 0x95ec, 0x1196, 0x8911, + 0x3d89, 0xda3c, 0x9fd9, 0xd79f, 0x4bd7, 0x214c, 0x3021, 0x4f30, 0x994e, + 0x5c99, 0x6f5d, 0x326f, 0xab31, 0x6aab, 0xe969, 0x90e9, 0x1190, 0xff10, + 0xa2fe, 0xe0a2, 0x66e1, 0x4067, 0x9e3f, 0x2d9e, 0x712d, 0x8170, 0xd180, + 0xffd1, 0x25ff, 0x3826, 0x2538, 0x5f24, 0xc45e, 0x1cc4, 0xdf1c, 0x93df, + 0xc793, 0x80c7, 0x2380, 0xd223, 0x7ed2, 0xfc7e, 0x22fd, 0x7422, 0x1474, + 0xb714, 0x7db6, 0x857d, 0xa85, 0xa60a, 0x88a6, 0x4289, 0x7842, 0xc278, + 0xf7c2, 0xcdf7, 0x84cd, 0xae84, 0x8cae, 0xb98c, 0x1aba, 0x4d1a, 0x884c, + 0x4688, 0xcc46, 0xd8cb, 0x2bd9, 0xbe2b, 0xa2be, 0x72a2, 0xf772, 0xd2f6, + 0x75d2, 0xc075, 0xa3c0, 0x63a3, 0xae63, 0x8fae, 0x2a90, 0x5f2a, 0xef5f, + 0x5cef, 0xa05c, 0x89a0, 0x5e89, 0x6b5e, 0x736b, 0x773, 0x9d07, 0xe99c, + 0x27ea, 0x2028, 0xc20, 0x980b, 0x4797, 0x2848, 0x9828, 0xc197, 0x48c2, + 0x2449, 0x7024, 0x570, 0x3e05, 0xd3e, 0xf60c, 0xbbf5, 0x69bb, 0x3f6a, + 0x740, 0xf006, 0xe0ef, 0xbbe0, 0xadbb, 0x56ad, 0xcf56, 0xbfce, 0xa9bf, + 0x205b, 0x6920, 0xae69, 0x50ae, 0x2050, 0xf01f, 0x27f0, 0x9427, 0x8993, + 0x8689, 0x4087, 0x6e40, 0xb16e, 0xa1b1, 0xe8a1, 0x87e8, 0x6f88, 0xfe6f, + 0x4cfe, 0xe94d, 0xd5e9, 0x47d6, 0x3148, 0x5f31, 0xc35f, 0x13c4, 0xa413, + 0x5a5, 0x2405, 0xc223, 0x66c2, 0x3667, 0x5e37, 0x5f5e, 0x2f5f, 0x8c2f, + 0xe48c, 0xd0e4, 0x4d1, 0xd104, 0xe4d0, 0xcee4, 0xfcf, 0x480f, 0xa447, + 0x5ea4, 0xff5e, 0xbefe, 0x8dbe, 0x1d8e, 0x411d, 0x1841, 0x6918, 0x5469, + 0x1155, 0xc611, 0xaac6, 0x37ab, 0x2f37, 0xca2e, 0x87ca, 0xbd87, 0xabbd, + 0xb3ab, 0xcb4, 0xce0c, 0xfccd, 0xa5fd, 0x72a5, 0xf072, 0x83f0, 0xfe83, + 0x97fd, 0xc997, 0xb0c9, 0xadb0, 0xe6ac, 0x88e6, 0x1088, 0xbe10, 0x16be, + 0xa916, 0xa3a8, 0x46a3, 0x5447, 0xe953, 0x84e8, 0x2085, 0xa11f, 0xfa1, + 0xdd0f, 0xbedc, 0x5abe, 0x805a, 0xc97f, 0x6dc9, 0x826d, 0x4a82, 0x934a, + 0x5293, 0xd852, 0xd3d8, 0xadd3, 0xf4ad, 0xf3f4, 0xfcf3, 0xfefc, 0xcafe, + 0xb7ca, 0x3cb8, 0xa13c, 0x18a1, 0x1418, 0xea13, 0x91ea, 0xf891, 0x53f8, + 0xa254, 0xe9a2, 0x87ea, 0x4188, 0x1c41, 0xdc1b, 0xf5db, 0xcaf5, 0x45ca, + 0x6d45, 0x396d, 0xde39, 0x90dd, 0x1e91, 0x1e, 0x7b00, 0x6a7b, 0xa46a, + 0xc9a3, 0x9bc9, 0x389b, 0x1139, 0x5211, 0x1f52, 0xeb1f, 0xabeb, 0x48ab, + 0x9348, 0xb392, 0x17b3, 0x1618, 0x5b16, 0x175b, 0xdc17, 0xdedb, 0x1cdf, + 0xeb1c, 0xd1ea, 0x4ad2, 0xd4b, 0xc20c, 0x24c2, 0x7b25, 0x137b, 0x8b13, + 0x618b, 0xa061, 0xff9f, 0xfffe, 0x72ff, 0xf572, 0xe2f5, 0xcfe2, 0xd2cf, + 0x75d3, 0x6a76, 0xc469, 0x1ec4, 0xfc1d, 0x59fb, 0x455a, 0x7a45, 0xa479, + 0xb7a4 +}; + static u8 tmp_buf[TEST_BUFLEN]; #define full_csum(buff, len, sum) csum_fold(csum_partial(buff, len, sum)) @@ -338,10 +575,55 @@ static void test_csum_no_carry_inputs(struct kunit *test) } } +static void test_ip_fast_csum(struct kunit *test) +{ + __sum16 csum_result, expected; + + for (int len = IPv4_MIN_WORDS; len < IPv4_MAX_WORDS; len++) { + for (int index = 0; index < NUM_IP_FAST_CSUM_TESTS; index++) { + csum_result = ip_fast_csum(random_buf + index, len); + expected = + expected_fast_csum[(len - IPv4_MIN_WORDS) * + NUM_IP_FAST_CSUM_TESTS + + index]; + CHECK_EQ(expected, csum_result); + } + } +} + +static void test_csum_ipv6_magic(struct kunit *test) +{ + const struct in6_addr *saddr; + const struct in6_addr *daddr; + unsigned int len; + unsigned char proto; + unsigned int csum; + + const int daddr_offset = sizeof(struct in6_addr); + const int len_offset = sizeof(struct in6_addr) + sizeof(struct in6_addr); + const int proto_offset = sizeof(struct in6_addr) + sizeof(struct in6_addr) + + sizeof(int); + const int csum_offset = sizeof(struct in6_addr) + sizeof(struct in6_addr) + + sizeof(int) + sizeof(char); + + for (int i = 0; i < NUM_IPv6_TESTS; i++) { + saddr = (const struct in6_addr *)(random_buf + i); + daddr = (const struct in6_addr *)(random_buf + i + + daddr_offset); + len = *(unsigned int *)(random_buf + i + len_offset); + proto = *(random_buf + i + proto_offset); + csum = *(unsigned int *)(random_buf + i + csum_offset); + CHECK_EQ(expected_csum_ipv6_magic[i], + csum_ipv6_magic(saddr, daddr, len, proto, csum)); + } +} + static struct kunit_case __refdata checksum_test_cases[] = { KUNIT_CASE(test_csum_fixed_random_inputs), KUNIT_CASE(test_csum_all_carry_inputs), KUNIT_CASE(test_csum_no_carry_inputs), + KUNIT_CASE(test_ip_fast_csum), + KUNIT_CASE(test_csum_ipv6_magic), {} }; -- GitLab From 55b71d2ce133da893ffb1ecd69a34e1fee509292 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 5 Dec 2023 16:53:50 -0700 Subject: [PATCH 1079/1290] riscv: Hoist linker relaxation disabling logic into Kconfig Certain configurations may need to be disabled if linker relaxation is in use, such as DWARF5 with ld.lld < 18. Hoist the logic of whether or not linker relaxation is in use into Kconfig so decisions can be made at configuration time. Reviewed-by: Fangrui Song Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20231205-riscv-restrict-dwarf5-llvm-v2-1-aedf00a382ac@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 5 +++++ arch/riscv/Makefile | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 95a2a06acc6a6..72be1d8122a3c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -181,6 +181,11 @@ config HAVE_SHADOW_CALL_STACK # https://github.com/riscv-non-isa/riscv-elf-psabi-doc/commit/a484e843e6eeb51f0cb7b8819e50da6d2444d769 depends on $(ld-option,--no-relax-gp) +config RISCV_USE_LINKER_RELAXATION + def_bool y + # https://github.com/llvm/llvm-project/commit/6611d58f5bbcbec77262d392e2923e1d680f6985 + depends on !LD_IS_LLD || LLD_VERSION >= 150000 + config ARCH_MMAP_RND_BITS_MIN default 18 if 64BIT default 8 diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index a74be78678eb0..e383aa9e27578 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -43,8 +43,7 @@ else KBUILD_LDFLAGS += -melf32lriscv endif -ifeq ($(CONFIG_LD_IS_LLD),y) -ifeq ($(call test-lt, $(CONFIG_LLD_VERSION), 150000),y) +ifndef CONFIG_RISCV_USE_LINKER_RELAXATION KBUILD_CFLAGS += -mno-relax KBUILD_AFLAGS += -mno-relax ifndef CONFIG_AS_IS_LLVM @@ -52,7 +51,6 @@ ifndef CONFIG_AS_IS_LLVM KBUILD_AFLAGS += -Wa,-mno-relax endif endif -endif ifeq ($(CONFIG_SHADOW_CALL_STACK),y) KBUILD_LDFLAGS += --no-relax-gp -- GitLab From ae84ff9a14a5a8d36a329a30626800155782e617 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 5 Dec 2023 16:53:51 -0700 Subject: [PATCH 1080/1290] riscv: Restrict DWARF5 when building with LLVM to known working versions LLVM prior to 18.0.0 would generate incorrect debug info for DWARF5 due to linker relaxation, which was worked around in clang by defaulting RISC-V to DWARF4 [1]. Unfortunately, this workaround does not work for the kernel because the DWARF version can be independently changed from the default in Kconfig. Do not allow DWARF5 to be selected for RISC-V when using linker relaxation (ld.lld >= 15.0.0) and a version of LLVM that does not have the fixes (the integrated assembler [2] and ld.lld [3] < 18.0.0) necessary to generate the correct debug info. Link: https://github.com/llvm/llvm-project/commit/bbc0f99f3bc96f1db16f649fc21dd18e5b0918f6 [1] Link: https://github.com/llvm/llvm-project/commit/1df5ea29b43690b6622db2cad7b745607ca4de6a [2] Link: https://github.com/llvm/llvm-project/commit/7ffabb61a5569444b5ac9322e22e5471cc5e4a77 [3] Signed-off-by: Nathan Chancellor Reviewed-by: Fangrui Song Link: https://lore.kernel.org/r/20231205-riscv-restrict-dwarf5-llvm-v2-2-aedf00a382ac@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 9 +++++++++ lib/Kconfig.debug | 1 + 2 files changed, 10 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 72be1d8122a3c..81b473cb47b05 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -186,6 +186,15 @@ config RISCV_USE_LINKER_RELAXATION # https://github.com/llvm/llvm-project/commit/6611d58f5bbcbec77262d392e2923e1d680f6985 depends on !LD_IS_LLD || LLD_VERSION >= 150000 +# https://github.com/llvm/llvm-project/commit/bbc0f99f3bc96f1db16f649fc21dd18e5b0918f6 +config ARCH_HAS_BROKEN_DWARF5 + def_bool y + depends on RISCV_USE_LINKER_RELAXATION + # https://github.com/llvm/llvm-project/commit/1df5ea29b43690b6622db2cad7b745607ca4de6a + depends on AS_IS_LLVM && AS_VERSION < 180000 + # https://github.com/llvm/llvm-project/commit/7ffabb61a5569444b5ac9322e22e5471cc5e4a77 + depends on LD_IS_LLD && LLD_VERSION < 180000 + config ARCH_MMAP_RND_BITS_MIN default 18 if 64BIT default 8 diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cc7d53d9dc019..a0ebce05a368f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -282,6 +282,7 @@ config DEBUG_INFO_DWARF4 config DEBUG_INFO_DWARF5 bool "Generate DWARF Version 5 debuginfo" select DEBUG_INFO + depends on !ARCH_HAS_BROKEN_DWARF5 depends on !CC_IS_CLANG || AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502 && AS_HAS_NON_CONST_LEB128) help Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc -- GitLab From a4426641f00cd2c293c91e881ab31faaf76b20fb Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 5 Dec 2023 16:53:52 -0700 Subject: [PATCH 1081/1290] lib/Kconfig.debug: Update AS_HAS_NON_CONST_LEB128 comment and name Fangrui noted that the comment around CONFIG_AS_HAS_NON_CONST_LEB128 could be made more accurate because explicit .sleb128 directives are not emitted, only .uleb128 directives are. Rename the symbol to CONFIG_AS_HAS_NON_CONST_ULEB128 as a result. Further clarifications include replacing "symbol deltas" with the more accurate "label differences", noting that this issue has been resolved in newer binutils (2.41+), and it only occurs when a port uses RISC-V style linker relaxation. Suggested-by: Fangrui Song Signed-off-by: Nathan Chancellor Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20231205-riscv-restrict-dwarf5-llvm-v2-3-aedf00a382ac@kernel.org Signed-off-by: Palmer Dabbelt --- lib/Kconfig.debug | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a0ebce05a368f..76c2cc6975731 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -231,9 +231,10 @@ config DEBUG_INFO in the "Debug information" choice below, indicating that debug information will be generated for build targets. -# Clang is known to generate .{s,u}leb128 with symbol deltas with DWARF5, which -# some targets may not support: https://sourceware.org/bugzilla/show_bug.cgi?id=27215 -config AS_HAS_NON_CONST_LEB128 +# Clang generates .uleb128 with label differences for DWARF v5, a feature that +# older binutils ports do not support when utilizing RISC-V style linker +# relaxation: https://sourceware.org/bugzilla/show_bug.cgi?id=27215 +config AS_HAS_NON_CONST_ULEB128 def_bool $(as-instr,.uleb128 .Lexpr_end4 - .Lexpr_start3\n.Lexpr_start3:\n.Lexpr_end4:) choice @@ -258,7 +259,7 @@ config DEBUG_INFO_NONE config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT bool "Rely on the toolchain's implicit default DWARF version" select DEBUG_INFO - depends on !CC_IS_CLANG || AS_IS_LLVM || CLANG_VERSION < 140000 || (AS_IS_GNU && AS_VERSION >= 23502 && AS_HAS_NON_CONST_LEB128) + depends on !CC_IS_CLANG || AS_IS_LLVM || CLANG_VERSION < 140000 || (AS_IS_GNU && AS_VERSION >= 23502 && AS_HAS_NON_CONST_ULEB128) help The implicit default version of DWARF debug info produced by a toolchain changes over time. @@ -283,7 +284,7 @@ config DEBUG_INFO_DWARF5 bool "Generate DWARF Version 5 debuginfo" select DEBUG_INFO depends on !ARCH_HAS_BROKEN_DWARF5 - depends on !CC_IS_CLANG || AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502 && AS_HAS_NON_CONST_LEB128) + depends on !CC_IS_CLANG || AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502 && AS_HAS_NON_CONST_ULEB128) help Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc 5.0+ accepts the -gdwarf-5 flag but only had partial support for some -- GitLab From b546d6363af4791567dcd145109837fe97cc8ba5 Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Thu, 30 Nov 2023 13:15:28 +0100 Subject: [PATCH 1082/1290] riscv: select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit afc76b8b8011 ("riscv: Using PATCHABLE_FUNCTION_ENTRY instead of MCOUNT") RISC-V added support for -fpatchable-function-entry, which removes the need for recordmcount. Select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY to tell the build system not to run recordmcount. Link: https://lore.kernel.org/linux-riscv/CAAYs2=j3Eak9vU6xbAw0zPuoh00rh8v5C2U3fePkokZFibWs2g@mail.gmail.com/T/#t Link: https://lore.kernel.org/linux-riscv/Y4jtfrJt+%2FQ5nMOz@spud/ Signed-off-by: Song Shuai Tested-by: Guo Ren Signed-off-by: Guo Ren Acked-by: Björn Töpel Link: https://lore.kernel.org/r/20231130121531.1178502-2-bjorn@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 95a2a06acc6a6..69c95e42be9f5 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -68,6 +68,7 @@ config RISCV select CPU_PM if CPU_IDLE || HIBERNATION select EDAC_SUPPORT select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE) + select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE select GENERIC_ARCH_TOPOLOGY select GENERIC_ATOMIC64 if !64BIT select GENERIC_CLOCKEVENTS_BROADCAST if SMP -- GitLab From 35e61e8827ee8ea09e6093ab4d8ba45efd537e36 Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Thu, 30 Nov 2023 13:15:29 +0100 Subject: [PATCH 1083/1290] riscv: ftrace: Make function graph use ftrace directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to commit 0c0593b45c9b ("x86/ftrace: Make function graph use ftrace directly") and commit c4a0ebf87ceb ("arm64/ftrace: Make function graph use ftrace directly"), RISC-V has no need for a special graph tracer hook. The graph_ops::func function can be used to install the return_hooker. This cleanup only changes the FTRACE_WITH_REGS implementation, leaving the mcount-based implementation is unaffected. Perform the simplification, and also cleanup the register save/restore macros. Signed-off-by: Song Shuai Tested-by: Guo Ren Signed-off-by: Guo Ren Acked-by: Björn Töpel Link: https://lore.kernel.org/r/20231130121531.1178502-3-bjorn@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/ftrace.h | 11 +- arch/riscv/kernel/ftrace.c | 30 +++-- arch/riscv/kernel/mcount-dyn.S | 190 +++++++++++++++++++++++++------- 3 files changed, 175 insertions(+), 56 deletions(-) diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 2b2f5df7ef2c7..b383926f73be1 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -128,7 +128,16 @@ do { \ struct dyn_ftrace; int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec); #define ftrace_init_nop ftrace_init_nop -#endif + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +struct ftrace_ops; +struct ftrace_regs; +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +#define ftrace_graph_func ftrace_graph_func +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ + +#endif /* __ASSEMBLY__ */ #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 03a6434a8cdd0..f5aa24d9e1c15 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -178,32 +178,28 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, } #ifdef CONFIG_DYNAMIC_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct pt_regs *regs = arch_ftrace_get_regs(fregs); + unsigned long *parent = (unsigned long *)®s->ra; + + prepare_ftrace_return(parent, ip, frame_pointer(regs)); +} +#else /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ extern void ftrace_graph_call(void); -extern void ftrace_graph_regs_call(void); int ftrace_enable_ftrace_graph_caller(void) { - int ret; - - ret = __ftrace_modify_call((unsigned long)&ftrace_graph_call, - (unsigned long)&prepare_ftrace_return, true, true); - if (ret) - return ret; - - return __ftrace_modify_call((unsigned long)&ftrace_graph_regs_call, + return __ftrace_modify_call((unsigned long)&ftrace_graph_call, (unsigned long)&prepare_ftrace_return, true, true); } int ftrace_disable_ftrace_graph_caller(void) { - int ret; - - ret = __ftrace_modify_call((unsigned long)&ftrace_graph_call, - (unsigned long)&prepare_ftrace_return, false, true); - if (ret) - return ret; - - return __ftrace_modify_call((unsigned long)&ftrace_graph_regs_call, + return __ftrace_modify_call((unsigned long)&ftrace_graph_call, (unsigned long)&prepare_ftrace_return, false, true); } +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index 58dd96a2a1534..c902a7ddb3106 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -57,31 +57,150 @@ .endm #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - .macro SAVE_ALL + +/** +* SAVE_ABI_REGS - save regs against the pt_regs struct +* +* @all: tell if saving all the regs +* +* If all is set, all the regs will be saved, otherwise only ABI +* related regs (a0-a7,epc,ra and optional s0) will be saved. +* +* After the stack is established, +* +* 0(sp) stores the PC of the traced function which can be accessed +* by &(fregs)->regs->epc in tracing function. Note that the real +* function entry address should be computed with -FENTRY_RA_OFFSET. +* +* 8(sp) stores the function return address (i.e. parent IP) that +* can be accessed by &(fregs)->regs->ra in tracing function. +* +* The other regs are saved at the respective localtion and accessed +* by the respective pt_regs member. +* +* Here is the layout of stack for your reference. +* +* PT_SIZE_ON_STACK -> +++++++++ +* + ..... + +* + t3-t6 + +* + s2-s11+ +* + a0-a7 + --++++-> ftrace_caller saved +* + s1 + + +* + s0 + --+ +* + t0-t2 + + +* + tp + + +* + gp + + +* + sp + + +* + ra + --+ // parent IP +* sp -> + epc + --+ // PC +* +++++++++ +**/ + .macro SAVE_ABI_REGS, all=0 addi sp, sp, -PT_SIZE_ON_STACK - REG_S t0, PT_EPC(sp) - REG_S x1, PT_RA(sp) - REG_S x2, PT_SP(sp) - REG_S x3, PT_GP(sp) - REG_S x4, PT_TP(sp) - REG_S x5, PT_T0(sp) - save_from_x6_to_x31 + REG_S t0, PT_EPC(sp) + REG_S x1, PT_RA(sp) + + // save the ABI regs + + REG_S x10, PT_A0(sp) + REG_S x11, PT_A1(sp) + REG_S x12, PT_A2(sp) + REG_S x13, PT_A3(sp) + REG_S x14, PT_A4(sp) + REG_S x15, PT_A5(sp) + REG_S x16, PT_A6(sp) + REG_S x17, PT_A7(sp) + + // save the leftover regs + + .if \all == 1 + REG_S x2, PT_SP(sp) + REG_S x3, PT_GP(sp) + REG_S x4, PT_TP(sp) + REG_S x5, PT_T0(sp) + REG_S x6, PT_T1(sp) + REG_S x7, PT_T2(sp) + REG_S x8, PT_S0(sp) + REG_S x9, PT_S1(sp) + REG_S x18, PT_S2(sp) + REG_S x19, PT_S3(sp) + REG_S x20, PT_S4(sp) + REG_S x21, PT_S5(sp) + REG_S x22, PT_S6(sp) + REG_S x23, PT_S7(sp) + REG_S x24, PT_S8(sp) + REG_S x25, PT_S9(sp) + REG_S x26, PT_S10(sp) + REG_S x27, PT_S11(sp) + REG_S x28, PT_T3(sp) + REG_S x29, PT_T4(sp) + REG_S x30, PT_T5(sp) + REG_S x31, PT_T6(sp) + + // save s0 if FP_TEST defined + + .else +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST + REG_S x8, PT_S0(sp) +#endif + .endif .endm - .macro RESTORE_ALL - REG_L x1, PT_RA(sp) - REG_L x2, PT_SP(sp) - REG_L x3, PT_GP(sp) - REG_L x4, PT_TP(sp) - /* Restore t0 with PT_EPC */ - REG_L x5, PT_EPC(sp) - restore_from_x6_to_x31 - + .macro RESTORE_ABI_REGS, all=0 + REG_L t0, PT_EPC(sp) + REG_L x1, PT_RA(sp) + REG_L x10, PT_A0(sp) + REG_L x11, PT_A1(sp) + REG_L x12, PT_A2(sp) + REG_L x13, PT_A3(sp) + REG_L x14, PT_A4(sp) + REG_L x15, PT_A5(sp) + REG_L x16, PT_A6(sp) + REG_L x17, PT_A7(sp) + + .if \all == 1 + REG_L x2, PT_SP(sp) + REG_L x3, PT_GP(sp) + REG_L x4, PT_TP(sp) + REG_L x6, PT_T1(sp) + REG_L x7, PT_T2(sp) + REG_L x8, PT_S0(sp) + REG_L x9, PT_S1(sp) + REG_L x18, PT_S2(sp) + REG_L x19, PT_S3(sp) + REG_L x20, PT_S4(sp) + REG_L x21, PT_S5(sp) + REG_L x22, PT_S6(sp) + REG_L x23, PT_S7(sp) + REG_L x24, PT_S8(sp) + REG_L x25, PT_S9(sp) + REG_L x26, PT_S10(sp) + REG_L x27, PT_S11(sp) + REG_L x28, PT_T3(sp) + REG_L x29, PT_T4(sp) + REG_L x30, PT_T5(sp) + REG_L x31, PT_T6(sp) + + .else +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST + REG_L x8, PT_S0(sp) +#endif + .endif addi sp, sp, PT_SIZE_ON_STACK .endm + + .macro PREPARE_ARGS + addi a0, t0, -FENTRY_RA_OFFSET + la a1, function_trace_op + REG_L a2, 0(a1) + mv a1, ra + mv a3, sp + .endm + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS SYM_FUNC_START(ftrace_caller) SAVE_ABI @@ -105,34 +224,29 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) call ftrace_stub #endif RESTORE_ABI - jr t0 + jr t0 SYM_FUNC_END(ftrace_caller) -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +#else /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ SYM_FUNC_START(ftrace_regs_caller) - SAVE_ALL - - addi a0, t0, -FENTRY_RA_OFFSET - la a1, function_trace_op - REG_L a2, 0(a1) - mv a1, ra - mv a3, sp + SAVE_ABI_REGS 1 + PREPARE_ARGS SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) call ftrace_stub -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - addi a0, sp, PT_RA - REG_L a1, PT_EPC(sp) - addi a1, a1, -FENTRY_RA_OFFSET -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - mv a2, s0 -#endif -SYM_INNER_LABEL(ftrace_graph_regs_call, SYM_L_GLOBAL) + RESTORE_ABI_REGS 1 + jr t0 +SYM_FUNC_END(ftrace_regs_caller) + +SYM_FUNC_START(ftrace_caller) + SAVE_ABI_REGS 0 + PREPARE_ARGS + +SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub -#endif - RESTORE_ALL - jr t0 -SYM_FUNC_END(ftrace_regs_caller) + RESTORE_ABI_REGS 0 + jr t0 +SYM_FUNC_END(ftrace_caller) #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ -- GitLab From 196c79f19a92764d45005599f35338cf0a9eafbb Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Thu, 30 Nov 2023 13:15:30 +0100 Subject: [PATCH 1084/1290] riscv: ftrace: Add DYNAMIC_FTRACE_WITH_DIRECT_CALLS support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Select the DYNAMIC_FTRACE_WITH_DIRECT_CALLS to provide the register_ftrace_direct[_multi] interfaces allowing users to register the customed trampoline (direct_caller) as the mcount for one or more target functions. And modify_ftrace_direct[_multi] are also provided for modifying direct_caller. To make the direct_caller and the other ftrace hooks (e.g. function/fgraph tracer, k[ret]probes) co-exist, a temporary register is nominated to store the address of direct_caller in ftrace_regs_caller. After the setting of the address direct_caller by direct_ops->func and the RESTORE_REGS in ftrace_regs_caller, direct_caller will be jumped to by the `jr` inst. Add DYNAMIC_FTRACE_WITH_DIRECT_CALLS support for RISC-V. Signed-off-by: Song Shuai Tested-by: Guo Ren Signed-off-by: Guo Ren Acked-by: Björn Töpel Link: https://lore.kernel.org/r/20231130121531.1178502-4-bjorn@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/ftrace.h | 7 +++++++ arch/riscv/kernel/mcount-dyn.S | 10 ++++++++++ 3 files changed, 18 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 69c95e42be9f5..4684cdc754a09 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -114,6 +114,7 @@ config RISCV select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS if MMU select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE) + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index b383926f73be1..3291721229523 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -135,6 +135,13 @@ struct ftrace_regs; void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); #define ftrace_graph_func ftrace_graph_func + +static inline void __arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) +{ + regs->t1 = addr; +} +#define arch_ftrace_set_direct_caller(fregs, addr) \ + __arch_ftrace_set_direct_caller(&(fregs)->regs, addr) #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index c902a7ddb3106..b7ce001779c17 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -229,6 +229,7 @@ SYM_FUNC_END(ftrace_caller) #else /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ SYM_FUNC_START(ftrace_regs_caller) + mv t1, zero SAVE_ABI_REGS 1 PREPARE_ARGS @@ -236,7 +237,10 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) call ftrace_stub RESTORE_ABI_REGS 1 + bnez t1, .Ldirect jr t0 +.Ldirect: + jr t1 SYM_FUNC_END(ftrace_regs_caller) SYM_FUNC_START(ftrace_caller) @@ -250,3 +254,9 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) jr t0 SYM_FUNC_END(ftrace_caller) #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +SYM_CODE_START(ftrace_stub_direct_tramp) + jr t0 +SYM_CODE_END(ftrace_stub_direct_tramp) +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ -- GitLab From 629291dd8499e4dc7dff6e9ab8c13b1a841059e8 Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Thu, 30 Nov 2023 13:15:31 +0100 Subject: [PATCH 1085/1290] samples: ftrace: Add RISC-V support for SAMPLE_FTRACE_DIRECT[_MULTI] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add RISC-V variants of the ftrace-direct* samples. Tested-by: Evgenii Shatokhin Signed-off-by: Song Shuai Tested-by: Guo Ren Signed-off-by: Guo Ren Acked-by: Björn Töpel Link: https://lore.kernel.org/r/20231130121531.1178502-5-bjorn@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 + samples/ftrace/ftrace-direct-modify.c | 35 ++++++++++++++++++ samples/ftrace/ftrace-direct-multi-modify.c | 41 +++++++++++++++++++++ samples/ftrace/ftrace-direct-multi.c | 25 +++++++++++++ samples/ftrace/ftrace-direct-too.c | 28 ++++++++++++++ samples/ftrace/ftrace-direct.c | 24 ++++++++++++ 6 files changed, 155 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4684cdc754a09..0ee79a92918dd 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -142,6 +142,8 @@ config RISCV select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RETHOOK if !XIP_KERNEL select HAVE_RSEQ + select HAVE_SAMPLE_FTRACE_DIRECT + select HAVE_SAMPLE_FTRACE_DIRECT_MULTI select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index e2a6a69352dfb..81220390851a3 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -24,6 +24,41 @@ extern void my_tramp2(void *); static unsigned long my_ip = (unsigned long)schedule; +#ifdef CONFIG_RISCV +#include + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" +" addi sp,sp,-2*"SZREG"\n" +" "REG_S" t0,0*"SZREG"(sp)\n" +" "REG_S" ra,1*"SZREG"(sp)\n" +" call my_direct_func1\n" +" "REG_L" t0,0*"SZREG"(sp)\n" +" "REG_L" ra,1*"SZREG"(sp)\n" +" addi sp,sp,2*"SZREG"\n" +" jr t0\n" +" .size my_tramp1, .-my_tramp1\n" +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" + +" my_tramp2:\n" +" addi sp,sp,-2*"SZREG"\n" +" "REG_S" t0,0*"SZREG"(sp)\n" +" "REG_S" ra,1*"SZREG"(sp)\n" +" call my_direct_func2\n" +" "REG_L" t0,0*"SZREG"(sp)\n" +" "REG_L" ra,1*"SZREG"(sp)\n" +" addi sp,sp,2*"SZREG"\n" +" jr t0\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_RISCV */ + #ifdef CONFIG_X86_64 #include diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c index 2e349834d63c3..f943e40d57fd3 100644 --- a/samples/ftrace/ftrace-direct-multi-modify.c +++ b/samples/ftrace/ftrace-direct-multi-modify.c @@ -22,6 +22,47 @@ void my_direct_func2(unsigned long ip) extern void my_tramp1(void *); extern void my_tramp2(void *); +#ifdef CONFIG_RISCV +#include + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" +" addi sp,sp,-3*"SZREG"\n" +" "REG_S" a0,0*"SZREG"(sp)\n" +" "REG_S" t0,1*"SZREG"(sp)\n" +" "REG_S" ra,2*"SZREG"(sp)\n" +" mv a0,t0\n" +" call my_direct_func1\n" +" "REG_L" a0,0*"SZREG"(sp)\n" +" "REG_L" t0,1*"SZREG"(sp)\n" +" "REG_L" ra,2*"SZREG"(sp)\n" +" addi sp,sp,3*"SZREG"\n" +" jr t0\n" +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:\n" +" addi sp,sp,-3*"SZREG"\n" +" "REG_S" a0,0*"SZREG"(sp)\n" +" "REG_S" t0,1*"SZREG"(sp)\n" +" "REG_S" ra,2*"SZREG"(sp)\n" +" mv a0,t0\n" +" call my_direct_func2\n" +" "REG_L" a0,0*"SZREG"(sp)\n" +" "REG_L" t0,1*"SZREG"(sp)\n" +" "REG_L" ra,2*"SZREG"(sp)\n" +" addi sp,sp,3*"SZREG"\n" +" jr t0\n" +" .size my_tramp2, .-my_tramp2\n" +" .popsection\n" +); + +#endif /* CONFIG_RISCV */ + #ifdef CONFIG_X86_64 #include diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c index 9243dbfe4d0c1..aed6df2927ce1 100644 --- a/samples/ftrace/ftrace-direct-multi.c +++ b/samples/ftrace/ftrace-direct-multi.c @@ -17,6 +17,31 @@ void my_direct_func(unsigned long ip) extern void my_tramp(void *); +#ifdef CONFIG_RISCV +#include + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" +" addi sp,sp,-3*"SZREG"\n" +" "REG_S" a0,0*"SZREG"(sp)\n" +" "REG_S" t0,1*"SZREG"(sp)\n" +" "REG_S" ra,2*"SZREG"(sp)\n" +" mv a0,t0\n" +" call my_direct_func\n" +" "REG_L" a0,0*"SZREG"(sp)\n" +" "REG_L" t0,1*"SZREG"(sp)\n" +" "REG_L" ra,2*"SZREG"(sp)\n" +" addi sp,sp,3*"SZREG"\n" +" jr t0\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_RISCV */ + #ifdef CONFIG_X86_64 #include diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c index e39c3563ae4e4..6ff546a5d7eb0 100644 --- a/samples/ftrace/ftrace-direct-too.c +++ b/samples/ftrace/ftrace-direct-too.c @@ -19,6 +19,34 @@ void my_direct_func(struct vm_area_struct *vma, unsigned long address, extern void my_tramp(void *); +#ifdef CONFIG_RISCV +#include + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" +" addi sp,sp,-5*"SZREG"\n" +" "REG_S" a0,0*"SZREG"(sp)\n" +" "REG_S" a1,1*"SZREG"(sp)\n" +" "REG_S" a2,2*"SZREG"(sp)\n" +" "REG_S" t0,3*"SZREG"(sp)\n" +" "REG_S" ra,4*"SZREG"(sp)\n" +" call my_direct_func\n" +" "REG_L" a0,0*"SZREG"(sp)\n" +" "REG_L" a1,1*"SZREG"(sp)\n" +" "REG_L" a2,2*"SZREG"(sp)\n" +" "REG_L" t0,3*"SZREG"(sp)\n" +" "REG_L" ra,4*"SZREG"(sp)\n" +" addi sp,sp,5*"SZREG"\n" +" jr t0\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_RISCV */ + #ifdef CONFIG_X86_64 #include diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c index 32c477da1e9aa..ef0945670e1eb 100644 --- a/samples/ftrace/ftrace-direct.c +++ b/samples/ftrace/ftrace-direct.c @@ -16,6 +16,30 @@ void my_direct_func(struct task_struct *p) extern void my_tramp(void *); +#ifdef CONFIG_RISCV +#include + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:\n" +" addi sp,sp,-3*"SZREG"\n" +" "REG_S" a0,0*"SZREG"(sp)\n" +" "REG_S" t0,1*"SZREG"(sp)\n" +" "REG_S" ra,2*"SZREG"(sp)\n" +" call my_direct_func\n" +" "REG_L" a0,0*"SZREG"(sp)\n" +" "REG_L" t0,1*"SZREG"(sp)\n" +" "REG_L" ra,2*"SZREG"(sp)\n" +" addi sp,sp,3*"SZREG"\n" +" jr t0\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_RISCV */ + #ifdef CONFIG_X86_64 #include -- GitLab From c4db7ff7a9edf504752704f08aabb5554bd6c37f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 19 Nov 2023 19:00:24 +0900 Subject: [PATCH 1086/1290] riscv: add dependency among Image(.gz), loader(.bin), and vmlinuz.efi A common issue in Makefile is a race in parallel building. You need to be careful to prevent multiple threads from writing to the same file simultaneously. Commit 3939f3345050 ("ARM: 8418/1: add boot image dependencies to not generate invalid images") addressed such a bad scenario. A similar symptom occurs with the following command: $ make -j$(nproc) ARCH=riscv Image Image.gz loader loader.bin vmlinuz.efi [ snip ] SORTTAB vmlinux OBJCOPY arch/riscv/boot/Image OBJCOPY arch/riscv/boot/Image OBJCOPY arch/riscv/boot/Image OBJCOPY arch/riscv/boot/Image OBJCOPY arch/riscv/boot/Image GZIP arch/riscv/boot/Image.gz AS arch/riscv/boot/loader.o AS arch/riscv/boot/loader.o Kernel: arch/riscv/boot/Image is ready PAD arch/riscv/boot/vmlinux.bin GZIP arch/riscv/boot/vmlinuz Kernel: arch/riscv/boot/loader is ready OBJCOPY arch/riscv/boot/loader.bin Kernel: arch/riscv/boot/loader.bin is ready Kernel: arch/riscv/boot/Image.gz is ready OBJCOPY arch/riscv/boot/vmlinuz.o LD arch/riscv/boot/vmlinuz.efi.elf OBJCOPY arch/riscv/boot/vmlinuz.efi Kernel: arch/riscv/boot/vmlinuz.efi is ready The log "OBJCOPY arch/riscv/boot/Image" is displayed 5 times. (also "AS arch/riscv/boot/loader.o" twice.) It indicates that 5 threads simultaneously enter arch/riscv/boot/ and write to arch/riscv/boot/Image. It occasionally leads to a build failure: $ make -j$(nproc) ARCH=riscv Image Image.gz loader loader.bin vmlinuz.efi [ snip ] SORTTAB vmlinux OBJCOPY arch/riscv/boot/Image OBJCOPY arch/riscv/boot/Image OBJCOPY arch/riscv/boot/Image OBJCOPY arch/riscv/boot/Image PAD arch/riscv/boot/vmlinux.bin truncate: Invalid number: 'arch/riscv/boot/vmlinux.bin' make[2]: *** [drivers/firmware/efi/libstub/Makefile.zboot:13: arch/riscv/boot/vmlinux.bin] Error 1 make[2]: *** Deleting file 'arch/riscv/boot/vmlinux.bin' make[1]: *** [arch/riscv/Makefile:167: vmlinuz.efi] Error 2 make[1]: *** Waiting for unfinished jobs.... Kernel: arch/riscv/boot/Image is ready GZIP arch/riscv/boot/Image.gz AS arch/riscv/boot/loader.o AS arch/riscv/boot/loader.o Kernel: arch/riscv/boot/loader is ready OBJCOPY arch/riscv/boot/loader.bin Kernel: arch/riscv/boot/loader.bin is ready Kernel: arch/riscv/boot/Image.gz is ready make: *** [Makefile:234: __sub-make] Error 2 Image.gz, loader, vmlinuz.efi depend on Image. loader.bin depends on loader. Such dependencies are not specified in arch/riscv/Makefile. Signed-off-by: Masahiro Yamada Acked-by: Ard Biesheuvel Reviewed-by: Samuel Holland Tested-by: Samuel Holland Link: https://lore.kernel.org/r/20231119100024.2370992-1-masahiroy@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 6539e43f82767..0b7d109258e7d 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -163,6 +163,8 @@ BOOT_TARGETS := Image Image.gz loader loader.bin xipImage vmlinuz.efi all: $(notdir $(KBUILD_IMAGE)) +loader.bin: loader +Image.gz loader vmlinuz.efi: Image $(BOOT_TARGETS): vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ @$(kecho) ' Kernel: $(boot)/$@ is ready' -- GitLab From 55ca8d7aa2af3ebdb6f85cccf1b0703d031c1678 Mon Sep 17 00:00:00 2001 From: Xiao Wang Date: Sun, 12 Nov 2023 17:52:44 +0800 Subject: [PATCH 1087/1290] riscv: Optimize hweight API with Zbb extension The Hamming Weight of a number is the total number of bits set in it, so the cpop/cpopw instruction from Zbb extension can be used to accelerate hweight() API. Signed-off-by: Xiao Wang Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20231112095244.4015351-1-xiao.w.wang@intel.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/arch_hweight.h | 78 +++++++++++++++++++++++++++ arch/riscv/include/asm/bitops.h | 4 +- 2 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/include/asm/arch_hweight.h diff --git a/arch/riscv/include/asm/arch_hweight.h b/arch/riscv/include/asm/arch_hweight.h new file mode 100644 index 0000000000000..c20236a0725b9 --- /dev/null +++ b/arch/riscv/include/asm/arch_hweight.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Based on arch/x86/include/asm/arch_hweight.h + */ + +#ifndef _ASM_RISCV_HWEIGHT_H +#define _ASM_RISCV_HWEIGHT_H + +#include +#include + +#if (BITS_PER_LONG == 64) +#define CPOPW "cpopw " +#elif (BITS_PER_LONG == 32) +#define CPOPW "cpop " +#else +#error "Unexpected BITS_PER_LONG" +#endif + +static __always_inline unsigned int __arch_hweight32(unsigned int w) +{ +#ifdef CONFIG_RISCV_ISA_ZBB + asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + RISCV_ISA_EXT_ZBB, 1) + : : : : legacy); + + asm (".option push\n" + ".option arch,+zbb\n" + CPOPW "%0, %0\n" + ".option pop\n" + : "+r" (w) : :); + + return w; + +legacy: +#endif + return __sw_hweight32(w); +} + +static inline unsigned int __arch_hweight16(unsigned int w) +{ + return __arch_hweight32(w & 0xffff); +} + +static inline unsigned int __arch_hweight8(unsigned int w) +{ + return __arch_hweight32(w & 0xff); +} + +#if BITS_PER_LONG == 64 +static __always_inline unsigned long __arch_hweight64(__u64 w) +{ +# ifdef CONFIG_RISCV_ISA_ZBB + asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, + RISCV_ISA_EXT_ZBB, 1) + : : : : legacy); + + asm (".option push\n" + ".option arch,+zbb\n" + "cpop %0, %0\n" + ".option pop\n" + : "+r" (w) : :); + + return w; + +legacy: +# endif + return __sw_hweight64(w); +} +#else /* BITS_PER_LONG == 64 */ +static inline unsigned long __arch_hweight64(__u64 w) +{ + return __arch_hweight32((u32)w) + + __arch_hweight32((u32)(w >> 32)); +} +#endif /* !(BITS_PER_LONG == 64) */ + +#endif /* _ASM_RISCV_HWEIGHT_H */ diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 224b4dc02b50b..9ffc355370248 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -271,7 +271,9 @@ legacy: #include #include -#include +#include + +#include #if (BITS_PER_LONG == 64) #define __AMO(op) "amo" #op ".d" -- GitLab From 10243401059287868a5651f869a2494368872add Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Thu, 30 Nov 2023 12:17:02 +0100 Subject: [PATCH 1088/1290] RISC-V: Implement archrandom when Zkr is available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Zkr extension is ratified and provides 16 bits of entropy seed when reading the SEED CSR. We can implement arch_get_random_seed_longs() by doing multiple csrrw to that CSR and filling an unsigned long with valid entropy bits. Acked-by: Conor Dooley Signed-off-by: Samuel Ortiz Signed-off-by: Clément Léger Link: https://lore.kernel.org/r/20231130111704.1319081-1-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/archrandom.h | 72 +++++++++++++++++++++++++++++ arch/riscv/include/asm/csr.h | 9 ++++ 2 files changed, 81 insertions(+) create mode 100644 arch/riscv/include/asm/archrandom.h diff --git a/arch/riscv/include/asm/archrandom.h b/arch/riscv/include/asm/archrandom.h new file mode 100644 index 0000000000000..5345360adfb9c --- /dev/null +++ b/arch/riscv/include/asm/archrandom.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Kernel interface for the RISCV arch_random_* functions + * + * Copyright (c) 2023 Rivos Inc. + * + */ + +#ifndef ASM_RISCV_ARCHRANDOM_H +#define ASM_RISCV_ARCHRANDOM_H + +#include +#include + +#define SEED_RETRY_LOOPS 100 + +static inline bool __must_check csr_seed_long(unsigned long *v) +{ + unsigned int retry = SEED_RETRY_LOOPS, valid_seeds = 0; + const int needed_seeds = sizeof(long) / sizeof(u16); + u16 *entropy = (u16 *)v; + + do { + /* + * The SEED CSR must be accessed with a read-write instruction. + */ + unsigned long csr_seed = csr_swap(CSR_SEED, 0); + unsigned long opst = csr_seed & SEED_OPST_MASK; + + switch (opst) { + case SEED_OPST_ES16: + entropy[valid_seeds++] = csr_seed & SEED_ENTROPY_MASK; + if (valid_seeds == needed_seeds) + return true; + break; + + case SEED_OPST_DEAD: + pr_err_once("archrandom: Unrecoverable error\n"); + return false; + + case SEED_OPST_BIST: + case SEED_OPST_WAIT: + default: + cpu_relax(); + continue; + } + } while (--retry); + + return false; +} + +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) +{ + return 0; +} + +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) +{ + if (!max_longs) + return 0; + + /* + * If Zkr is supported and csr_seed_long succeeds, we return one long + * worth of entropy. + */ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZKR) && csr_seed_long(v)) + return 1; + + return 0; +} + +#endif /* ASM_RISCV_ARCHRANDOM_H */ diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 306a19a5509c1..510014051f5db 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -411,6 +411,15 @@ #define CSR_VTYPE 0xc21 #define CSR_VLENB 0xc22 +/* Scalar Crypto Extension - Entropy */ +#define CSR_SEED 0x015 +#define SEED_OPST_MASK _AC(0xC0000000, UL) +#define SEED_OPST_BIST _AC(0x00000000, UL) +#define SEED_OPST_WAIT _AC(0x40000000, UL) +#define SEED_OPST_ES16 _AC(0x80000000, UL) +#define SEED_OPST_DEAD _AC(0xC0000000, UL) +#define SEED_ENTROPY_MASK _AC(0xFFFF, UL) + #ifdef CONFIG_RISCV_M_MODE # define CSR_STATUS CSR_MSTATUS # define CSR_IE CSR_MIE -- GitLab From 080c4324fa5e81ff3780206a138223abfb57a68e Mon Sep 17 00:00:00 2001 From: Maxim Kochetkov Date: Thu, 14 Dec 2023 09:39:06 +0300 Subject: [PATCH 1089/1290] riscv: optimize ELF relocation function in riscv The patch can optimize the running times of insmod command by modify ELF relocation function. In the 5.10 and latest kernel, when install the riscv ELF drivers which contains multiple symbol table items to be relocated, kernel takes a lot of time to execute the relocation. For example, we install a 3+MB driver need 180+s. We focus on the riscv architecture handle R_RISCV_HI20 and R_RISCV_LO20 type items relocation function in the arch\riscv\kernel\module.c and find that there are two-loops in the function. If we modify the begin number in the second for-loops iteration, we could save significant time for installation. We install the same 3+MB driver could just need 2s. Signed-off-by: Amma Lee Signed-off-by: Maxim Kochetkov Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20231214063906.13612-1-fido_max@inbox.ru Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/module.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index c9d59a5448b6c..5e5a82644451e 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -783,6 +783,7 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, Elf_Sym *sym; void *location; unsigned int i, type; + unsigned int j_idx = 0; Elf_Addr v; int res; unsigned int num_relocations = sechdrs[relsec].sh_size / sizeof(*rel); @@ -833,9 +834,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, v = sym->st_value + rel[i].r_addend; if (type == R_RISCV_PCREL_LO12_I || type == R_RISCV_PCREL_LO12_S) { - unsigned int j; + unsigned int j = j_idx; + bool found = false; - for (j = 0; j < sechdrs[relsec].sh_size / sizeof(*rel); j++) { + do { unsigned long hi20_loc = sechdrs[sechdrs[relsec].sh_info].sh_addr + rel[j].r_offset; @@ -864,16 +866,26 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, hi20 = (offset + 0x800) & 0xfffff000; lo12 = offset - hi20; v = lo12; + found = true; break; } - } - if (j == sechdrs[relsec].sh_size / sizeof(*rel)) { + + j++; + if (j > sechdrs[relsec].sh_size / sizeof(*rel)) + j = 0; + + } while (j_idx != j); + + if (!found) { pr_err( "%s: Can not find HI20 relocation information\n", me->name); return -EINVAL; } + + /* Record the previous j-loop end index */ + j_idx = j; } if (reloc_handlers[type].accumulate_handler) -- GitLab From 01b55f4f0cd6ad1a16eca6c43a3190005892ef91 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:39 -0800 Subject: [PATCH 1090/1290] libbpf: feature-detect arg:ctx tag support in kernel Add feature detector of kernel-side arg:ctx (__arg_ctx) tag support. If this is detected, libbpf will avoid doing any __arg_ctx-related BTF rewriting and checks in favor of letting kernel handle this completely. test_global_funcs/ctx_arg_rewrite subtest is adjusted to do the same feature detection (albeit in much simpler, though round-about and inefficient, way), and skip the tests. This is done to still be able to execute this test on older kernels (like in libbpf CI). Note, BPF token series ([0]) does a major refactor and code moving of libbpf-internal feature detection "framework", so to avoid unnecessary conflicts we keep newly added feature detection stand-alone with ad-hoc result caching. Once things settle, there will be a small follow up to re-integrate everything back and move code into its final place in newly-added (by BPF token series) features.c file. [0] https://patchwork.kernel.org/project/netdevbpf/list/?series=814209&state=* Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 67 +++++++++++++++++++ .../bpf/prog_tests/test_global_funcs.c | 13 ++++ 2 files changed, 80 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c5a42ac309fdb..61db92189517b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6757,6 +6757,69 @@ static int clone_func_btf_info(struct btf *btf, int orig_fn_id, struct bpf_progr return fn_id; } +static int probe_kern_arg_ctx_tag(void) +{ + /* To minimize merge conflicts with BPF token series that refactors + * feature detection code a lot, we don't integrate + * probe_kern_arg_ctx_tag() into kernel_supports() feature-detection + * framework yet, doing our own caching internally. + * This will be cleaned up a bit later when bpf/bpf-next trees settle. + */ + static int cached_result = -1; + static const char strs[] = "\0a\0b\0arg:ctx\0"; + const __u32 types[] = { + /* [1] INT */ + BTF_TYPE_INT_ENC(1 /* "a" */, BTF_INT_SIGNED, 0, 32, 4), + /* [2] PTR -> VOID */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0), + /* [3] FUNC_PROTO `int(void *a)` */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 1), + BTF_PARAM_ENC(1 /* "a" */, 2), + /* [4] FUNC 'a' -> FUNC_PROTO (main prog) */ + BTF_TYPE_ENC(1 /* "a" */, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 3), + /* [5] FUNC_PROTO `int(void *b __arg_ctx)` */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 1), + BTF_PARAM_ENC(3 /* "b" */, 2), + /* [6] FUNC 'b' -> FUNC_PROTO (subprog) */ + BTF_TYPE_ENC(3 /* "b" */, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 5), + /* [7] DECL_TAG 'arg:ctx' -> func 'b' arg 'b' */ + BTF_TYPE_DECL_TAG_ENC(5 /* "arg:ctx" */, 6, 0), + }; + const struct bpf_insn insns[] = { + /* main prog */ + BPF_CALL_REL(+1), + BPF_EXIT_INSN(), + /* global subprog */ + BPF_EMIT_CALL(BPF_FUNC_get_func_ip), /* needs PTR_TO_CTX */ + BPF_EXIT_INSN(), + }; + const struct bpf_func_info_min func_infos[] = { + { 0, 4 }, /* main prog -> FUNC 'a' */ + { 2, 6 }, /* subprog -> FUNC 'b' */ + }; + LIBBPF_OPTS(bpf_prog_load_opts, opts); + int prog_fd, btf_fd, insn_cnt = ARRAY_SIZE(insns); + + if (cached_result >= 0) + return cached_result; + + btf_fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs)); + if (btf_fd < 0) + return 0; + + opts.prog_btf_fd = btf_fd; + opts.func_info = &func_infos; + opts.func_info_cnt = ARRAY_SIZE(func_infos); + opts.func_info_rec_size = sizeof(func_infos[0]); + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, "det_arg_ctx", + "GPL", insns, insn_cnt, &opts); + close(btf_fd); + + cached_result = probe_fd(prog_fd); + return cached_result; +} + /* Check if main program or global subprog's function prototype has `arg:ctx` * argument tags, and, if necessary, substitute correct type to match what BPF * verifier would expect, taking into account specific program type. This @@ -6780,6 +6843,10 @@ static int bpf_program_fixup_func_info(struct bpf_object *obj, struct bpf_progra if (!obj->btf_ext || !prog->func_info) return 0; + /* don't do any fix ups if kernel natively supports __arg_ctx */ + if (probe_kern_arg_ctx_tag() > 0) + return 0; + /* some BPF program types just don't have named context structs, so * this fallback mechanism doesn't work for them */ diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c index 67d4ef9e62b37..e905cbaf6b3d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c @@ -47,6 +47,19 @@ static void subtest_ctx_arg_rewrite(void) struct btf *btf = NULL; __u32 info_len = sizeof(info); int err, fd, i; + struct btf *kern_btf = NULL; + + kern_btf = btf__load_vmlinux_btf(); + if (!ASSERT_OK_PTR(kern_btf, "kern_btf_load")) + return; + + /* simple detection of kernel native arg:ctx tag support */ + if (btf__find_by_name_kind(kern_btf, "bpf_subprog_arg_info", BTF_KIND_STRUCT) > 0) { + test__skip(); + btf__free(kern_btf); + return; + } + btf__free(kern_btf); skel = test_global_func_ctx_args__open(); if (!ASSERT_OK_PTR(skel, "skel_open")) -- GitLab From 66967a32d3b16ed447e76fed4d946bab52e43d86 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:40 -0800 Subject: [PATCH 1091/1290] bpf: extract bpf_ctx_convert_map logic and make it more reusable Refactor btf_get_prog_ctx_type() a bit to allow reuse of bpf_ctx_convert_map logic in more than one places. Simplify interface by returning btf_type instead of btf_member (field reference in BTF). To do the above we need to touch and start untangling btf_translate_to_vmlinux() implementation. We do the bare minimum to not regress anything for btf_translate_to_vmlinux(), but its implementation is very questionable for what it claims to be doing. Mapping kfunc argument types to kernel corresponding types conceptually is quite different from recognizing program context types. Fixing this is out of scope for this change though. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 2 +- kernel/bpf/btf.c | 71 ++++++++++++++++++++++++++++----------------- 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index 59d404e22814e..cf5c6ff489812 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -512,7 +512,7 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); -const struct btf_member * +const struct btf_type * btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 51e8b4bee0c83..10ac9efc662d0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5615,21 +5615,46 @@ static u8 bpf_ctx_convert_map[] = { #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE -const struct btf_member * -btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type, - int arg) +static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type) { const struct btf_type *conv_struct; - const struct btf_type *ctx_struct; const struct btf_member *ctx_type; - const char *tname, *ctx_tname; conv_struct = bpf_ctx_convert.t; - if (!conv_struct) { - bpf_log(log, "btf_vmlinux is malformed\n"); + if (!conv_struct) return NULL; - } + /* prog_type is valid bpf program type. No need for bounds check. */ + ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; + /* ctx_type is a pointer to prog_ctx_type in vmlinux. + * Like 'struct __sk_buff' + */ + return btf_type_by_id(btf_vmlinux, ctx_type->type); +} + +static int find_kern_ctx_type_id(enum bpf_prog_type prog_type) +{ + const struct btf_type *conv_struct; + const struct btf_member *ctx_type; + + conv_struct = bpf_ctx_convert.t; + if (!conv_struct) + return -EFAULT; + /* prog_type is valid bpf program type. No need for bounds check. */ + ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; + /* ctx_type is a pointer to prog_ctx_type in vmlinux. + * Like 'struct sk_buff' + */ + return ctx_type->type; +} + +const struct btf_type * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) +{ + const struct btf_type *ctx_type; + const char *tname, *ctx_tname; + t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); @@ -5646,17 +5671,15 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, bpf_log(log, "arg#%d struct doesn't have a name\n", arg); return NULL; } - /* prog_type is valid bpf program type. No need for bounds check. */ - ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; - /* ctx_struct is a pointer to prog_ctx_type in vmlinux. - * Like 'struct __sk_buff' - */ - ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type); - if (!ctx_struct) + + ctx_type = find_canonical_prog_ctx_type(prog_type); + if (!ctx_type) { + bpf_log(log, "btf_vmlinux is malformed\n"); /* should not happen */ return NULL; + } again: - ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off); + ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); if (!ctx_tname) { /* should not happen */ bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); @@ -5677,10 +5700,10 @@ again: /* bpf_user_pt_regs_t is a typedef, so resolve it to * underlying struct and check name again */ - if (!btf_type_is_modifier(ctx_struct)) + if (!btf_type_is_modifier(ctx_type)) return NULL; - while (btf_type_is_modifier(ctx_struct)) - ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type); + while (btf_type_is_modifier(ctx_type)) + ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); goto again; } return ctx_type; @@ -5692,13 +5715,9 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, enum bpf_prog_type prog_type, int arg) { - const struct btf_member *prog_ctx_type, *kern_ctx_type; - - prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg); - if (!prog_ctx_type) + if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg)) return -ENOENT; - kern_ctx_type = prog_ctx_type + 1; - return kern_ctx_type->type; + return find_kern_ctx_type_id(prog_type); } int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) -- GitLab From 0ba971511d16603599f947459e59b435cc465b0d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:41 -0800 Subject: [PATCH 1092/1290] bpf: enforce types for __arg_ctx-tagged arguments in global subprogs Add enforcement of expected types for context arguments tagged with arg:ctx (__arg_ctx) tag. First, any program type will accept generic `void *` context type when combined with __arg_ctx tag. Besides accepting "canonical" struct names and `void *`, for a bunch of program types for which program context is actually a named struct, we allows a bunch of pragmatic exceptions to match real-world and expected usage: - for both kprobes and perf_event we allow `bpf_user_pt_regs_t *` as canonical context argument type, where `bpf_user_pt_regs_t` is a *typedef*, not a struct; - for kprobes, we also always accept `struct pt_regs *`, as that's what actually is passed as a context to any kprobe program; - for perf_event, we resolve typedefs (unless it's `bpf_user_pt_regs_t`) down to actual struct type and accept `struct pt_regs *`, or `struct user_pt_regs *`, or `struct user_regs_struct *`, depending on the actual struct type kernel architecture points `bpf_user_pt_regs_t` typedef to; otherwise, canonical `struct bpf_perf_event_data *` is expected; - for raw_tp/raw_tp.w programs, `u64/long *` are accepted, as that's what's expected with BPF_PROG() usage; otherwise, canonical `struct bpf_raw_tracepoint_args *` is expected; - tp_btf supports both `struct bpf_raw_tracepoint_args *` and `u64 *` formats, both are coded as expections as tp_btf is actually a TRACING program type, which has no canonical context type; - iterator programs accept `struct bpf_iter__xxx *` structs, currently with no further iterator-type specific enforcement; - fentry/fexit/fmod_ret/lsm/struct_ops all accept `u64 *`; - classic tracepoint programs, as well as syscall and freplace programs allow any user-provided type. In all other cases kernel will enforce exact match of struct name to expected canonical type. And if user-provided type doesn't match that expectation, verifier will emit helpful message with expected type name. Note a bit unnatural way the check is done after processing all the arguments. This is done to avoid conflict between bpf and bpf-next trees. Once trees converge, a small follow up patch will place a simple btf_validate_prog_ctx_type() check into a proper ARG_PTR_TO_CTX branch (which bpf-next tree patch refactored already), removing duplicated arg:ctx detection logic. Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 10ac9efc662d0..5964711891767 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5709,6 +5709,149 @@ again: return ctx_type; } +/* forward declarations for arch-specific underlying types of + * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef + * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still + * works correctly with __builtin_types_compatible_p() on respective + * architectures + */ +struct user_regs_struct; +struct user_pt_regs; + +static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int arg, + enum bpf_prog_type prog_type, + enum bpf_attach_type attach_type) +{ + const struct btf_type *ctx_type; + const char *tname, *ctx_tname; + + if (!btf_is_ptr(t)) { + bpf_log(log, "arg#%d type isn't a pointer\n", arg); + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); + + /* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */ + if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) { + while (btf_type_is_modifier(t) && !btf_type_is_typedef(t)) + t = btf_type_by_id(btf, t->type); + + if (btf_type_is_typedef(t)) { + tname = btf_name_by_offset(btf, t->name_off); + if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0) + return 0; + } + } + + /* all other program types don't use typedefs for context type */ + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + + /* `void *ctx __arg_ctx` is always valid */ + if (btf_type_is_void(t)) + return 0; + + tname = btf_name_by_offset(btf, t->name_off); + if (str_is_empty(tname)) { + bpf_log(log, "arg#%d type doesn't have a name\n", arg); + return -EINVAL; + } + + /* special cases */ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return 0; + break; + case BPF_PROG_TYPE_PERF_EVENT: + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) && + __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return 0; + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) && + __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0) + return 0; + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) && + __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0) + return 0; + break; + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + case BPF_PROG_TYPE_TRACING: + switch (attach_type) { + case BPF_TRACE_RAW_TP: + /* tp_btf program is TRACING, so need special case here */ + if (__btf_type_is_struct(t) && + strcmp(tname, "bpf_raw_tracepoint_args") == 0) + return 0; + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + case BPF_TRACE_ITER: + /* allow struct bpf_iter__xxx types only */ + if (__btf_type_is_struct(t) && + strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0) + return 0; + break; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + default: + break; + } + break; + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_SYSCALL: + case BPF_PROG_TYPE_EXT: + return 0; /* anything goes */ + default: + break; + } + + ctx_type = find_canonical_prog_ctx_type(prog_type); + if (!ctx_type) { + /* should not happen */ + bpf_log(log, "btf_vmlinux is malformed\n"); + return -EINVAL; + } + + /* resolve typedefs and check that underlying structs are matching as well */ + while (btf_type_is_modifier(ctx_type)) + ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); + + /* if program type doesn't have distinctly named struct type for + * context, then __arg_ctx argument can only be `void *`, which we + * already checked above + */ + if (!__btf_type_is_struct(ctx_type)) { + bpf_log(log, "arg#%d should be void pointer\n", arg); + return -EINVAL; + } + + ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); + if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) { + bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname); + return -EINVAL; + } + + return 0; +} + static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, @@ -6953,6 +7096,23 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return -EINVAL; } + for (i = 0; i < nargs; i++) { + const char *tag; + + if (sub->args[i].arg_type != ARG_PTR_TO_CTX) + continue; + + /* check if arg has "arg:ctx" tag */ + t = btf_type_by_id(btf, args[i].type); + tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:"); + if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0) + continue; + + if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type, + prog->expected_attach_type)) + return -EINVAL; + } + sub->arg_cnt = nargs; sub->args_cached = true; -- GitLab From 989410cde81959c4033dc287d79b42b6eb04f04f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:42 -0800 Subject: [PATCH 1093/1290] selftests/bpf: add tests confirming type logic in kernel for __arg_ctx Add a bunch of global subprogs across variety of program types to validate expected kernel type enforcement logic for __arg_ctx arguments. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_global_subprogs.c | 164 +++++++++++++++++- 1 file changed, 161 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index 9eeb2d89cda88..67dddd9418911 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -3,6 +3,7 @@ #include #include +#include #include "bpf_misc.h" #include "xdp_metadata.h" #include "bpf_kfuncs.h" @@ -138,25 +139,182 @@ __weak int subprog_ctx_tag(void *ctx __arg_ctx) return bpf_get_stack(ctx, stack, sizeof(stack), 0); } +__weak int raw_tp_canonical(struct bpf_raw_tracepoint_args *ctx __arg_ctx) +{ + return 0; +} + +__weak int raw_tp_u64_array(u64 *ctx __arg_ctx) +{ + return 0; +} + SEC("?raw_tp") __success __log_level(2) int arg_tag_ctx_raw_tp(void *ctx) { - return subprog_ctx_tag(ctx); + return subprog_ctx_tag(ctx) + raw_tp_canonical(ctx) + raw_tp_u64_array(ctx); +} + +SEC("?raw_tp.w") +__success __log_level(2) +int arg_tag_ctx_raw_tp_writable(void *ctx) +{ + return subprog_ctx_tag(ctx) + raw_tp_canonical(ctx) + raw_tp_u64_array(ctx); +} + +SEC("?tp_btf/sys_enter") +__success __log_level(2) +int arg_tag_ctx_raw_tp_btf(void *ctx) +{ + return subprog_ctx_tag(ctx) + raw_tp_canonical(ctx) + raw_tp_u64_array(ctx); +} + +struct whatever { }; + +__weak int tp_whatever(struct whatever *ctx __arg_ctx) +{ + return 0; } SEC("?tp") __success __log_level(2) int arg_tag_ctx_tp(void *ctx) { - return subprog_ctx_tag(ctx); + return subprog_ctx_tag(ctx) + tp_whatever(ctx); +} + +__weak int kprobe_subprog_pt_regs(struct pt_regs *ctx __arg_ctx) +{ + return 0; +} + +__weak int kprobe_subprog_typedef(bpf_user_pt_regs_t *ctx __arg_ctx) +{ + return 0; } SEC("?kprobe") __success __log_level(2) int arg_tag_ctx_kprobe(void *ctx) { - return subprog_ctx_tag(ctx); + return subprog_ctx_tag(ctx) + + kprobe_subprog_pt_regs(ctx) + + kprobe_subprog_typedef(ctx); +} + +__weak int perf_subprog_regs( +#if defined(bpf_target_riscv) + struct user_regs_struct *ctx __arg_ctx +#elif defined(bpf_target_s390) + /* user_pt_regs typedef is anonymous struct, so only `void *` works */ + void *ctx __arg_ctx +#elif defined(bpf_target_loongarch) || defined(bpf_target_arm64) || defined(bpf_target_powerpc) + struct user_pt_regs *ctx __arg_ctx +#else + struct pt_regs *ctx __arg_ctx +#endif +) +{ + return 0; +} + +__weak int perf_subprog_typedef(bpf_user_pt_regs_t *ctx __arg_ctx) +{ + return 0; +} + +__weak int perf_subprog_canonical(struct bpf_perf_event_data *ctx __arg_ctx) +{ + return 0; +} + +SEC("?perf_event") +__success __log_level(2) +int arg_tag_ctx_perf(void *ctx) +{ + return subprog_ctx_tag(ctx) + + perf_subprog_regs(ctx) + + perf_subprog_typedef(ctx) + + perf_subprog_canonical(ctx); +} + +__weak int iter_subprog_void(void *ctx __arg_ctx) +{ + return 0; +} + +__weak int iter_subprog_typed(struct bpf_iter__task *ctx __arg_ctx) +{ + return 0; +} + +SEC("?iter/task") +__success __log_level(2) +int arg_tag_ctx_iter_task(struct bpf_iter__task *ctx) +{ + return (iter_subprog_void(ctx) + iter_subprog_typed(ctx)) & 1; +} + +__weak int tracing_subprog_void(void *ctx __arg_ctx) +{ + return 0; +} + +__weak int tracing_subprog_u64(u64 *ctx __arg_ctx) +{ + return 0; +} + +int acc; + +SEC("?fentry/" SYS_PREFIX "sys_nanosleep") +__success __log_level(2) +int BPF_PROG(arg_tag_ctx_fentry) +{ + acc += tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); + return 0; +} + +SEC("?fexit/" SYS_PREFIX "sys_nanosleep") +__success __log_level(2) +int BPF_PROG(arg_tag_ctx_fexit) +{ + acc += tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); + return 0; +} + +SEC("?fmod_ret/" SYS_PREFIX "sys_nanosleep") +__success __log_level(2) +int BPF_PROG(arg_tag_ctx_fmod_ret) +{ + return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); +} + +SEC("?lsm/bpf") +__success __log_level(2) +int BPF_PROG(arg_tag_ctx_lsm) +{ + return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); +} + +SEC("?struct_ops/test_1") +__success __log_level(2) +int BPF_PROG(arg_tag_ctx_struct_ops) +{ + return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx); +} + +SEC(".struct_ops") +struct bpf_dummy_ops dummy_1 = { + .test_1 = (void *)arg_tag_ctx_struct_ops, +}; + +SEC("?syscall") +__success __log_level(2) +int arg_tag_ctx_syscall(void *ctx) +{ + return tracing_subprog_void(ctx) + tracing_subprog_u64(ctx) + tp_whatever(ctx); } __weak int subprog_dynptr(struct bpf_dynptr *dptr) -- GitLab From 76ec90a996e3c707eb6772510afa36faeba2ecff Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:43 -0800 Subject: [PATCH 1094/1290] libbpf: warn on unexpected __arg_ctx type when rewriting BTF On kernel that don't support arg:ctx tag, before adjusting global subprog BTF information to match kernel's expected canonical type names, make sure that types used by user are meaningful, and if not, warn and don't do BTF adjustments. This is similar to checks that kernel performs, but narrower in scope, as only a small subset of BPF program types can be accommodated by libbpf using canonical type names. Libbpf unconditionally allows `struct pt_regs *` for perf_event program types, unlike kernel, which supports that conditionally on architecture. This is done to keep things simple and not cause unnecessary false positives. This seems like a minor and harmless deviation, which in real-world programs will be caught by kernels with arg:ctx tag support anyways. So KISS principle. This logic is hard to test (especially on latest kernels), so manual testing was performed instead. Libbpf emitted the following warning for perf_event program with wrong context argument type: libbpf: prog 'arg_tag_ctx_perf': subprog 'subprog_ctx_tag' arg#0 is expected to be of `struct bpf_perf_event_data *` type Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 75 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 61db92189517b..afd09571c482b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6695,6 +6695,67 @@ static struct { /* all other program types don't have "named" context structs */ }; +static bool need_func_arg_type_fixup(const struct btf *btf, const struct bpf_program *prog, + const char *subprog_name, int arg_idx, + int arg_type_id, const char *ctx_name) +{ + const struct btf_type *t; + const char *tname; + + /* check if existing parameter already matches verifier expectations */ + t = skip_mods_and_typedefs(btf, arg_type_id, NULL); + if (!btf_is_ptr(t)) + goto out_warn; + + /* typedef bpf_user_pt_regs_t is a special PITA case, valid for kprobe + * and perf_event programs, so check this case early on and forget + * about it for subsequent checks + */ + while (btf_is_mod(t)) + t = btf__type_by_id(btf, t->type); + if (btf_is_typedef(t) && + (prog->type == BPF_PROG_TYPE_KPROBE || prog->type == BPF_PROG_TYPE_PERF_EVENT)) { + tname = btf__str_by_offset(btf, t->name_off) ?: ""; + if (strcmp(tname, "bpf_user_pt_regs_t") == 0) + return false; /* canonical type for kprobe/perf_event */ + } + + /* now we can ignore typedefs moving forward */ + t = skip_mods_and_typedefs(btf, t->type, NULL); + + /* if it's `void *`, definitely fix up BTF info */ + if (btf_is_void(t)) + return true; + + /* if it's already proper canonical type, no need to fix up */ + tname = btf__str_by_offset(btf, t->name_off) ?: ""; + if (btf_is_struct(t) && strcmp(tname, ctx_name) == 0) + return false; + + /* special cases */ + switch (prog->type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_PERF_EVENT: + /* `struct pt_regs *` is expected, but we need to fix up */ + if (btf_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return true; + break; + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return true; + break; + default: + break; + } + +out_warn: + pr_warn("prog '%s': subprog '%s' arg#%d is expected to be of `struct %s *` type\n", + prog->name, subprog_name, arg_idx, ctx_name); + return false; +} + static int clone_func_btf_info(struct btf *btf, int orig_fn_id, struct bpf_program *prog) { int fn_id, fn_proto_id, ret_type_id, orig_proto_id; @@ -6829,7 +6890,7 @@ static int probe_kern_arg_ctx_tag(void) */ static int bpf_program_fixup_func_info(struct bpf_object *obj, struct bpf_program *prog) { - const char *ctx_name = NULL, *ctx_tag = "arg:ctx"; + const char *ctx_name = NULL, *ctx_tag = "arg:ctx", *fn_name; struct bpf_func_info_min *func_rec; struct btf_type *fn_t, *fn_proto_t; struct btf *btf = obj->btf; @@ -6909,15 +6970,11 @@ static int bpf_program_fixup_func_info(struct bpf_object *obj, struct bpf_progra if (arg_idx < 0 || arg_idx >= arg_cnt) continue; - /* check if existing parameter already matches verifier expectations */ + /* check if we should fix up argument type */ p = &btf_params(fn_proto_t)[arg_idx]; - t = skip_mods_and_typedefs(btf, p->type, NULL); - if (btf_is_ptr(t) && - (t = skip_mods_and_typedefs(btf, t->type, NULL)) && - btf_is_struct(t) && - strcmp(btf__str_by_offset(btf, t->name_off), ctx_name) == 0) { - continue; /* no need for fix up */ - } + fn_name = btf__str_by_offset(btf, fn_t->name_off) ?: ""; + if (!need_func_arg_type_fixup(btf, prog, fn_name, arg_idx, p->type, ctx_name)) + continue; /* clone fn/fn_proto, unless we already did it for another arg */ if (func_rec->type_id == orig_fn_id) { -- GitLab From 39369c9a6e090619a7b5eedb2212c99ac8a03890 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2024 07:43:11 -0800 Subject: [PATCH 1095/1290] selftests: netdevsim: add a config file netdevsim tests aren't very well integrated with kselftest, which has its advantages and disadvantages. But regardless of the intended integration - a config file to know what kernel to build is very useful, add one. Fixes: fc4c93f145d7 ("selftests: add basic netdevsim devlink flash testing") Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240116154311.1945801-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/netdevsim/config | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 tools/testing/selftests/drivers/net/netdevsim/config diff --git a/tools/testing/selftests/drivers/net/netdevsim/config b/tools/testing/selftests/drivers/net/netdevsim/config new file mode 100644 index 0000000000000..adf45a3a78b41 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/config @@ -0,0 +1,10 @@ +CONFIG_DUMMY=y +CONFIG_GENEVE=m +CONFIG_IPV6=y +CONFIG_NETDEVSIM=m +CONFIG_NET_SCH_MQPRIO=y +CONFIG_NET_SCH_MULTIQ=y +CONFIG_NET_SCH_PRIO=y +CONFIG_PSAMPLE=y +CONFIG_PTP_1588_CLOCK_MOCK=y +CONFIG_VXLAN=m -- GitLab From dd2d40acdbb2b9e6bcddd5424b0e00c1760ecf26 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Tue, 16 Jan 2024 10:49:26 -0500 Subject: [PATCH 1096/1290] selftests: bonding: Add more missing config options As a followup to commit 03fb8565c880 ("selftests: bonding: add missing build configs"), add more networking-specific config options which are needed for bonding tests. For testing, I used the minimal config generated by virtme-ng and I added the options in the config file. All bonding tests passed. Fixes: bbb774d921e2 ("net: Add tests for bonding and team address list management") # for ipv6 Fixes: 6cbe791c0f4e ("kselftest: bonding: add num_grat_arp test") # for tc options Fixes: 222c94ec0ad4 ("selftests: bonding: add tests for ether type changes") # for nlmon Suggested-by: Jakub Kicinski Signed-off-by: Benjamin Poirier Link: https://lore.kernel.org/r/20240116154926.202164-1-bpoirier@nvidia.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/bonding/config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index f85b16fc5128c..899d7fb6ea8e9 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -1,5 +1,10 @@ CONFIG_BONDING=y CONFIG_BRIDGE=y CONFIG_DUMMY=y +CONFIG_IPV6=y CONFIG_MACVLAN=y +CONFIG_NET_ACT_GACT=y +CONFIG_NET_CLS_FLOWER=y +CONFIG_NET_SCH_INGRESS=y +CONFIG_NLMON=y CONFIG_VETH=y -- GitLab From 832dd634bd1b4e3bbe9f10b9c9ba5db6f6f2b97f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 16 Jan 2024 11:02:20 +0000 Subject: [PATCH 1097/1290] arm64: entry: fix ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD Currently the ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD workaround isn't quite right, as it is supposed to be applied after the last explicit memory access, but is immediately followed by an LDR. The ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD workaround is used to handle Cortex-A520 erratum 2966298 and Cortex-A510 erratum 3117295, which are described in: * https://developer.arm.com/documentation/SDEN2444153/0600/?lang=en * https://developer.arm.com/documentation/SDEN1873361/1600/?lang=en In both cases the workaround is described as: | If pagetable isolation is disabled, the context switch logic in the | kernel can be updated to execute the following sequence on affected | cores before exiting to EL0, and after all explicit memory accesses: | | 1. A non-shareable TLBI to any context and/or address, including | unused contexts or addresses, such as a `TLBI VALE1 Xzr`. | | 2. A DSB NSH to guarantee completion of the TLBI. The important part being that the TLBI+DSB must be placed "after all explicit memory accesses". Unfortunately, as-implemented, the TLBI+DSB is immediately followed by an LDR, as we have: | alternative_if ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD | tlbi vale1, xzr | dsb nsh | alternative_else_nop_endif | alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 | ldr lr, [sp, #S_LR] | add sp, sp, #PT_REGS_SIZE // restore sp | eret | alternative_else_nop_endif | | [ ... KPTI exception return path ... ] This patch fixes this by reworking the logic to place the TLBI+DSB immediately before the ERET, after all explicit memory accesses. The ERET is currently in a separate alternative block, and alternatives cannot be nested. To account for this, the alternative block for ARM64_UNMAP_KERNEL_AT_EL0 is replaced with a single alternative branch to skip the KPTI logic, with the new shape of the logic being: | alternative_insn "b .L_skip_tramp_exit_\@", nop, ARM64_UNMAP_KERNEL_AT_EL0 | [ ... KPTI exception return path ... ] | .L_skip_tramp_exit_\@: | | ldr lr, [sp, #S_LR] | add sp, sp, #PT_REGS_SIZE // restore sp | | alternative_if ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD | tlbi vale1, xzr | dsb nsh | alternative_else_nop_endif | eret The new structure means that the workaround is only applied when KPTI is not in use; this is fine as noted in the documented implications of the erratum: | Pagetable isolation between EL0 and higher level ELs prevents the | issue from occurring. ... and as per the workaround description quoted above, the workaround is only necessary "If pagetable isolation is disabled". Fixes: 471470bc7052 ("arm64: errata: Add Cortex-A520 speculative unprivileged load workaround") Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Rob Herring Cc: Will Deacon Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240116110221.420467-2-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry.S | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 544ab46649f3f..7fcbee0f6c0e4 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -428,16 +428,9 @@ alternative_else_nop_endif ldp x28, x29, [sp, #16 * 14] .if \el == 0 -alternative_if ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD - tlbi vale1, xzr - dsb nsh -alternative_else_nop_endif -alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 - ldr lr, [sp, #S_LR] - add sp, sp, #PT_REGS_SIZE // restore sp - eret -alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + alternative_insn "b .L_skip_tramp_exit_\@", nop, ARM64_UNMAP_KERNEL_AT_EL0 + msr far_el1, x29 ldr_this_cpu x30, this_cpu_vector, x29 @@ -446,7 +439,18 @@ alternative_else_nop_endif ldr lr, [sp, #S_LR] // restore x30 add sp, sp, #PT_REGS_SIZE // restore sp br x29 + +.L_skip_tramp_exit_\@: #endif + ldr lr, [sp, #S_LR] + add sp, sp, #PT_REGS_SIZE // restore sp + + /* This must be after the last explicit memory access */ +alternative_if ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD + tlbi vale1, xzr + dsb nsh +alternative_else_nop_endif + eret .else ldr lr, [sp, #S_LR] add sp, sp, #PT_REGS_SIZE // restore sp -- GitLab From da59f1d051d57e85eca49401a3a36d5a622babde Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 16 Jan 2024 11:02:21 +0000 Subject: [PATCH 1098/1290] arm64: entry: simplify kernel_exit logic For historical reasons, the non-KPTI exception return path is duplicated for EL1 and EL0, with the structure: .if \el == 0 [ KPTI handling ] ldr lr, [sp, #S_LR] add sp, sp, #PT_REGS_SIZE // restore sp [ EL0 exception return workaround ] eret .else ldr lr, [sp, #S_LR] add sp, sp, #PT_REGS_SIZE // restore sp [ EL1 exception return workaround ] eret .endif sb This would be simpler and clearer with the common portions factored out, e.g. .if \el == 0 [ KPTI handling ] .endif ldr lr, [sp, #S_LR] add sp, sp, #PT_REGS_SIZE // restore sp .if \el == 0 [ EL0 exception return workaround ] .else [ EL1 exception return workaround ] .endif eret sb This expands to the same code, but is simpler for a human to follow as it avoids duplicates the restore of LR+SP, and makes it clear that the ERET is associated with the SB. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Rob Herring Cc: Will Deacon Link: https://lore.kernel.org/r/20240116110221.420467-3-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry.S | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7fcbee0f6c0e4..7ef0e127b149f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -442,24 +442,23 @@ alternative_else_nop_endif .L_skip_tramp_exit_\@: #endif + .endif + ldr lr, [sp, #S_LR] add sp, sp, #PT_REGS_SIZE // restore sp + .if \el == 0 /* This must be after the last explicit memory access */ alternative_if ARM64_WORKAROUND_SPECULATIVE_UNPRIV_LOAD tlbi vale1, xzr dsb nsh alternative_else_nop_endif - eret .else - ldr lr, [sp, #S_LR] - add sp, sp, #PT_REGS_SIZE // restore sp - /* Ensure any device/NC reads complete */ alternative_insn nop, "dmb sy", ARM64_WORKAROUND_1508412 + .endif eret - .endif sb .endm -- GitLab From b7c510d049049409e8945b932f4b0b357fa17415 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 15 Jan 2024 18:42:38 +0000 Subject: [PATCH 1099/1290] arm64/ptrace: Don't flush ZA/ZT storage when writing ZA via ptrace When writing ZA we currently unconditionally flush the buffer used to store it as part of ensuring that it is allocated. Since this buffer is shared with ZT0 this means that a write to ZA when PSTATE.ZA is already set will corrupt the value of ZT0 on a SME2 system. Fix this by only flushing the backing storage if PSTATE.ZA was not previously set. This will mean that short or failed writes may leave stale data in the buffer, this seems as correct as our current behaviour and unlikely to be something that userspace will rely on. Fixes: f90b529bcbe5 ("arm64/sme: Implement ZT0 ptrace support") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240115-arm64-fix-ptrace-za-zt-v1-1-48617517028a@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/ptrace.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 20d7ef82de90a..b3f64144b5cd9 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1107,12 +1107,13 @@ static int za_set(struct task_struct *target, } } - /* Allocate/reinit ZA storage */ - sme_alloc(target, true); - if (!target->thread.sme_state) { - ret = -ENOMEM; - goto out; - } + /* + * Only flush the storage if PSTATE.ZA was not already set, + * otherwise preserve any existing data. + */ + sme_alloc(target, !thread_za_enabled(&target->thread)); + if (!target->thread.sme_state) + return -ENOMEM; /* If there is no data then disable ZA */ if (!count) { -- GitLab From 8410186ca48002092818500b7c209e569b47a2ac Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 15 Jan 2024 19:53:01 +0000 Subject: [PATCH 1100/1290] arm64/fpsimd: Remove spurious check for SVE support There is no need to check for SVE support when changing vector lengths, even if the system is SME only we still need SVE storage for the streaming SVE state. Fixes: d4d5be94a878 ("arm64/fpsimd: Ensure SME storage is allocated after SVE VL changes") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240115-arm64-sve-enabled-check-v1-1-a26360b00f6d@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/fpsimd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 505f389be3e0d..0983be2b1b614 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -898,10 +898,8 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, * allocate SVE now in case it is needed for use in streaming * mode. */ - if (system_supports_sve()) { - sve_free(task); - sve_alloc(task, true); - } + sve_free(task); + sve_alloc(task, true); if (free_sme) sme_free(task); -- GitLab From dc7eb8755797ed41a0d1b5c0c39df3c8f401b3d9 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 15 Jan 2024 20:15:46 +0000 Subject: [PATCH 1101/1290] arm64/sme: Always exit sme_alloc() early with existing storage When sme_alloc() is called with existing storage and we are not flushing we will always allocate new storage, both leaking the existing storage and corrupting the state. Fix this by separating the checks for flushing and for existing storage as we do for SVE. Callers that reallocate (eg, due to changing the vector length) should call sme_free() themselves. Fixes: 5d0a8d2fba50 ("arm64/ptrace: Ensure that SME is set up for target when writing SSVE state") Signed-off-by: Mark Brown Cc: Link: https://lore.kernel.org/r/20240115-arm64-sme-flush-v1-1-7472bd3459b7@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/fpsimd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 0983be2b1b614..a5dc6f7641958 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1217,8 +1217,10 @@ void fpsimd_release_task(struct task_struct *dead_task) */ void sme_alloc(struct task_struct *task, bool flush) { - if (task->thread.sme_state && flush) { - memset(task->thread.sme_state, 0, sme_state_size(task)); + if (task->thread.sme_state) { + if (flush) + memset(task->thread.sme_state, 0, + sme_state_size(task)); return; } -- GitLab From 1b20d0486a602417defb5bf33320d31b2a7a47f8 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 17 Jan 2024 17:05:45 +0000 Subject: [PATCH 1102/1290] arm64: Fix silcon-errata.rst formatting Remove the errant blank lines to make the desired empty row separators around the Fujitsu and ASR entries in the main table, rather than them being their own separate tables which then look odd in the HTML view. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/b6637654eda761e224f828a44a7bbc1eadf2ef88.1705511145.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- Documentation/arch/arm64/silicon-errata.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 7acd64c61f50c..b30bd6abffbe9 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -227,11 +227,9 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | Rockchip | RK3588 | #3588001 | ROCKCHIP_ERRATUM_3588001 | +----------------+-----------------+-----------------+-----------------------------+ - +----------------+-----------------+-----------------+-----------------------------+ | Fujitsu | A64FX | E#010001 | FUJITSU_ERRATUM_010001 | +----------------+-----------------+-----------------+-----------------------------+ - +----------------+-----------------+-----------------+-----------------------------+ | ASR | ASR8601 | #8601001 | N/A | +----------------+-----------------+-----------------+-----------------------------+ -- GitLab From f1172f3ee3a98754d95b968968920a7d03fdebcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludvig=20P=C3=A4rsson?= Date: Wed, 17 Jan 2024 13:03:14 +0100 Subject: [PATCH 1103/1290] ethtool: netlink: Add missing ethnl_ops_begin/complete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accessing an ethernet device that is powered off or clock gated might cause the CPU to hang. Add ethnl_ops_begin/complete in ethnl_set_features() to protect against this. Fixes: 0980bfcd6954 ("ethtool: set netdev features with FEATURES_SET request") Signed-off-by: Ludvig Pärsson Link: https://lore.kernel.org/r/20240117-etht2-v2-1-1a96b6e8c650@axis.com Signed-off-by: Paolo Abeni --- net/ethtool/features.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/ethtool/features.c b/net/ethtool/features.c index a79af8c25a071..b6cb101d7f19e 100644 --- a/net/ethtool/features.c +++ b/net/ethtool/features.c @@ -234,17 +234,20 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) dev = req_info.dev; rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; ethnl_features_to_bitmap(old_active, dev->features); ethnl_features_to_bitmap(old_wanted, dev->wanted_features); ret = ethnl_parse_bitset(req_wanted, req_mask, NETDEV_FEATURE_COUNT, tb[ETHTOOL_A_FEATURES_WANTED], netdev_features_strings, info->extack); if (ret < 0) - goto out_rtnl; + goto out_ops; if (ethnl_bitmap_to_features(req_mask) & ~NETIF_F_ETHTOOL_BITS) { GENL_SET_ERR_MSG(info, "attempt to change non-ethtool features"); ret = -EINVAL; - goto out_rtnl; + goto out_ops; } /* set req_wanted bits not in req_mask from old_wanted */ @@ -281,6 +284,8 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) if (mod) netdev_features_change(dev); +out_ops: + ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); -- GitLab From 7a8e9cdf9405819105ae7405cd91e482bf574b01 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 16 Jan 2024 08:09:25 -0600 Subject: [PATCH 1104/1290] seq_buf: Make DECLARE_SEQ_BUF() usable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the address operator on the array doesn't work: ./include/linux/seq_buf.h:27:27: error: initialization of ‘char *’ from incompatible pointer type ‘char (*)[128]’ [-Werror=incompatible-pointer-types] 27 | .buffer = &__ ## NAME ## _buffer, \ | ^ Apart from fixing that, we can improve DECLARE_SEQ_BUF() by using a compound literal to define the buffer array without attaching a name to it. This makes the macro a single statement, allowing constructs such as: static DECLARE_SEQ_BUF(my_seq_buf, MYSB_SIZE); to work as intended. Link: https://lkml.kernel.org/r/20240116-declare-seq-buf-fix-v1-1-915db4692f32@linux.ibm.com Cc: stable@vger.kernel.org Acked-by: Kees Cook Fixes: dcc4e5728eea ("seq_buf: Introduce DECLARE_SEQ_BUF and seq_buf_str()") Signed-off-by: Nathan Lynch Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 5fb1f12c33f90..c44f4b47b9453 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -22,9 +22,8 @@ struct seq_buf { }; #define DECLARE_SEQ_BUF(NAME, SIZE) \ - char __ ## NAME ## _buffer[SIZE] = ""; \ struct seq_buf NAME = { \ - .buffer = &__ ## NAME ## _buffer, \ + .buffer = (char[SIZE]) { 0 }, \ .size = SIZE, \ } -- GitLab From baa7d536077dcdfe2b70c476a8873d1745d3de0f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Jan 2024 18:59:01 +0100 Subject: [PATCH 1105/1290] loop: fix the the direct I/O support check when used on top of block devices __loop_update_dio only checks the alignment requirement for block backed file systems, but misses them for the case where the loop device is created directly on top of another block device. Due to this creating a loop device with default option plus the direct I/O flag on a > 512 byte sector size file system will lead to incorrect I/O being submitted to the lower block device and a lot of error from the lock layer. This can be seen with xfstests generic/563. Fix the code in __loop_update_dio by factoring the alignment check into a helper, and calling that also for the struct block_device of a block device inode. Also remove the TODO comment talking about dynamically switching between buffered and direct I/O, which is a would be a recipe for horrible performance and occasional data loss. Fixes: 2e5ab5f379f9 ("block: loop: prepare for supporing direct IO") Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20240117175901.871796-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 52 +++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 2bd10f3bfcb29..01bb943624048 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -165,39 +165,37 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file) return get_size(lo->lo_offset, lo->lo_sizelimit, file); } +/* + * We support direct I/O only if lo_offset is aligned with the logical I/O size + * of backing device, and the logical block size of loop is bigger than that of + * the backing device. + */ +static bool lo_bdev_can_use_dio(struct loop_device *lo, + struct block_device *backing_bdev) +{ + unsigned short sb_bsize = bdev_logical_block_size(backing_bdev); + + if (queue_logical_block_size(lo->lo_queue) < sb_bsize) + return false; + if (lo->lo_offset & (sb_bsize - 1)) + return false; + return true; +} + static void __loop_update_dio(struct loop_device *lo, bool dio) { struct file *file = lo->lo_backing_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned short sb_bsize = 0; - unsigned dio_align = 0; + struct inode *inode = file->f_mapping->host; + struct block_device *backing_bdev = NULL; bool use_dio; - if (inode->i_sb->s_bdev) { - sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev); - dio_align = sb_bsize - 1; - } + if (S_ISBLK(inode->i_mode)) + backing_bdev = I_BDEV(inode); + else if (inode->i_sb->s_bdev) + backing_bdev = inode->i_sb->s_bdev; - /* - * We support direct I/O only if lo_offset is aligned with the - * logical I/O size of backing device, and the logical block - * size of loop is bigger than the backing device's. - * - * TODO: the above condition may be loosed in the future, and - * direct I/O may be switched runtime at that time because most - * of requests in sane applications should be PAGE_SIZE aligned - */ - if (dio) { - if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && - !(lo->lo_offset & dio_align) && - (file->f_mode & FMODE_CAN_ODIRECT)) - use_dio = true; - else - use_dio = false; - } else { - use_dio = false; - } + use_dio = dio && (file->f_mode & FMODE_CAN_ODIRECT) && + (!backing_bdev || lo_bdev_can_use_dio(lo, backing_bdev)); if (lo->use_dio == use_dio) return; -- GitLab From b2e792ae883a0aa976d4176dfa7dc933263440ea Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Thu, 18 Jan 2024 09:29:56 +0000 Subject: [PATCH 1106/1290] Documentation: block: ioprio: Update schedulers This doc hasn't been touched in a while, in the meantime some new io schedulers were added (e.g. all of mq), some with ioprio support. Also reword the introduction to remove reference to CFQ and the limitation that io priorities only work on reads, which is no longer true. Signed-off-by: Christian Loehle Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/a86cfdc8-016f-40f1-8b58-0cb15d2a792c@arm.com Signed-off-by: Jens Axboe --- Documentation/block/ioprio.rst | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Documentation/block/ioprio.rst b/Documentation/block/ioprio.rst index a25c6d5df87b2..4662e1ff3d81f 100644 --- a/Documentation/block/ioprio.rst +++ b/Documentation/block/ioprio.rst @@ -6,17 +6,16 @@ Block io priorities Intro ----- -With the introduction of cfq v3 (aka cfq-ts or time sliced cfq), basic io -priorities are supported for reads on files. This enables users to io nice -processes or process groups, similar to what has been possible with cpu -scheduling for ages. This document mainly details the current possibilities -with cfq; other io schedulers do not support io priorities thus far. +The io priority feature enables users to io nice processes or process groups, +similar to what has been possible with cpu scheduling for ages. Support for io +priorities is io scheduler dependent and currently supported by bfq and +mq-deadline. Scheduling classes ------------------ -CFQ implements three generic scheduling classes that determine how io is -served for a process. +Three generic scheduling classes are implemented for io priorities that +determine how io is served for a process. IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given higher priority than any other in the system, processes from this class are -- GitLab From 6d6eeabcfaba2fcadf5443b575789ea606f9de83 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 17 Jan 2024 16:04:16 +0100 Subject: [PATCH 1107/1290] mlxsw: spectrum_acl_erp: Fix error flow of pool allocation failure Lately, a bug was found when many TC filters are added - at some point, several bugs are printed to dmesg [1] and the switch is crashed with segmentation fault. The issue starts when gen_pool_free() fails because of unexpected behavior - a try to free memory which is already freed, this leads to BUG() call which crashes the switch and makes many other bugs. Trying to track down the unexpected behavior led to a bug in eRP code. The function mlxsw_sp_acl_erp_table_alloc() gets a pointer to the allocated index, sets the value and returns an error code. When gen_pool_alloc() fails it returns address 0, we track it and return -ENOBUFS outside, BUT the call for gen_pool_alloc() already override the index in erp_table structure. This is a problem when such allocation is done as part of table expansion. This is not a new table, which will not be used in case of allocation failure. We try to expand eRP table and override the current index (non-zero) with zero. Then, it leads to an unexpected behavior when address 0 is freed twice. Note that address 0 is valid in erp_table->base_index and indeed other tables use it. gen_pool_alloc() fails in case that there is no space left in the pre-allocated pool, in our case, the pool is limited to ACL_MAX_ERPT_BANK_SIZE, which is read from hardware. When more than max erp entries are required, we exceed the limit and return an error, this error leads to "Failed to migrate vregion" print. Fix this by changing erp_table->base_index only in case of a successful allocation. Add a test case for such a scenario. Without this fix it causes segmentation fault: $ TESTS="max_erp_entries_test" ./tc_flower.sh ./tc_flower.sh: line 988: 1560 Segmentation fault tc filter del dev $h2 ingress chain $i protocol ip pref $i handle $j flower &>/dev/null [1]: kernel BUG at lib/genalloc.c:508! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 6 PID: 3531 Comm: tc Not tainted 6.7.0-rc5-custom-ga6893f479f5e #1 Hardware name: Mellanox Technologies Ltd. MSN4700/VMOD0010, BIOS 5.11 07/12/2021 RIP: 0010:gen_pool_free_owner+0xc9/0xe0 ... Call Trace: __mlxsw_sp_acl_erp_table_other_dec+0x70/0xa0 [mlxsw_spectrum] mlxsw_sp_acl_erp_mask_destroy+0xf5/0x110 [mlxsw_spectrum] objagg_obj_root_destroy+0x18/0x80 [objagg] objagg_obj_destroy+0x12c/0x130 [objagg] mlxsw_sp_acl_erp_mask_put+0x37/0x50 [mlxsw_spectrum] mlxsw_sp_acl_ctcam_region_entry_remove+0x74/0xa0 [mlxsw_spectrum] mlxsw_sp_acl_ctcam_entry_del+0x1e/0x40 [mlxsw_spectrum] mlxsw_sp_acl_tcam_ventry_del+0x78/0xd0 [mlxsw_spectrum] mlxsw_sp_flower_destroy+0x4d/0x70 [mlxsw_spectrum] mlxsw_sp_flow_block_cb+0x73/0xb0 [mlxsw_spectrum] tc_setup_cb_destroy+0xc1/0x180 fl_hw_destroy_filter+0x94/0xc0 [cls_flower] __fl_delete+0x1ac/0x1c0 [cls_flower] fl_destroy+0xc2/0x150 [cls_flower] tcf_proto_destroy+0x1a/0xa0 ... mlxsw_spectrum3 0000:07:00.0: Failed to migrate vregion mlxsw_spectrum3 0000:07:00.0: Failed to migrate vregion Fixes: f465261aa105 ("mlxsw: spectrum_acl: Implement common eRP core") Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/4cfca254dfc0e5d283974801a24371c7b6db5989.1705502064.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlxsw/spectrum_acl_erp.c | 8 +-- .../drivers/net/mlxsw/spectrum-2/tc_flower.sh | 52 ++++++++++++++++++- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c index 4c98950380d53..d231f4d2888be 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c @@ -301,6 +301,7 @@ mlxsw_sp_acl_erp_table_alloc(struct mlxsw_sp_acl_erp_core *erp_core, unsigned long *p_index) { unsigned int num_rows, entry_size; + unsigned long index; /* We only allow allocations of entire rows */ if (num_erps % erp_core->num_erp_banks != 0) @@ -309,10 +310,11 @@ mlxsw_sp_acl_erp_table_alloc(struct mlxsw_sp_acl_erp_core *erp_core, entry_size = erp_core->erpt_entries_size[region_type]; num_rows = num_erps / erp_core->num_erp_banks; - *p_index = gen_pool_alloc(erp_core->erp_tables, num_rows * entry_size); - if (*p_index == 0) + index = gen_pool_alloc(erp_core->erp_tables, num_rows * entry_size); + if (!index) return -ENOBUFS; - *p_index -= MLXSW_SP_ACL_ERP_GENALLOC_OFFSET; + + *p_index = index - MLXSW_SP_ACL_ERP_GENALLOC_OFFSET; return 0; } diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh index fb850e0ec8375..7bf56ea161e35 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh @@ -10,7 +10,8 @@ lib_dir=$(dirname $0)/../../../../net/forwarding ALL_TESTS="single_mask_test identical_filters_test two_masks_test \ multiple_masks_test ctcam_edge_cases_test delta_simple_test \ delta_two_masks_one_key_test delta_simple_rehash_test \ - bloom_simple_test bloom_complex_test bloom_delta_test" + bloom_simple_test bloom_complex_test bloom_delta_test \ + max_erp_entries_test" NUM_NETIFS=2 source $lib_dir/lib.sh source $lib_dir/tc_common.sh @@ -983,6 +984,55 @@ bloom_delta_test() log_test "bloom delta test ($tcflags)" } +max_erp_entries_test() +{ + # The number of eRP entries is limited. Once the maximum number of eRPs + # has been reached, filters cannot be added. This test verifies that + # when this limit is reached, inserstion fails without crashing. + + RET=0 + + local num_masks=32 + local num_regions=15 + local chain_failed + local mask_failed + local ret + + if [[ "$tcflags" != "skip_sw" ]]; then + return 0; + fi + + for ((i=1; i < $num_regions; i++)); do + for ((j=$num_masks; j >= 0; j--)); do + tc filter add dev $h2 ingress chain $i protocol ip \ + pref $i handle $j flower $tcflags \ + dst_ip 192.1.0.0/$j &> /dev/null + ret=$? + + if [ $ret -ne 0 ]; then + chain_failed=$i + mask_failed=$j + break 2 + fi + done + done + + # We expect to exceed the maximum number of eRP entries, so that + # insertion eventually fails. Otherwise, the test should be adjusted to + # add more filters. + check_fail $ret "expected to exceed number of eRP entries" + + for ((; i >= 1; i--)); do + for ((j=0; j <= $num_masks; j++)); do + tc filter del dev $h2 ingress chain $i protocol ip \ + pref $i handle $j flower &> /dev/null + done + done + + log_test "max eRP entries test ($tcflags). " \ + "max chain $chain_failed, mask $mask_failed" +} + setup_prepare() { h1=${NETIFS[p1]} -- GitLab From efeb7dfea8ee10cdec11b6b6ba4e405edbe75809 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 17 Jan 2024 16:04:17 +0100 Subject: [PATCH 1108/1290] mlxsw: spectrum_acl_tcam: Fix NULL pointer dereference in error path When calling mlxsw_sp_acl_tcam_region_destroy() from an error path after failing to attach the region to an ACL group, we hit a NULL pointer dereference upon 'region->group->tcam' [1]. Fix by retrieving the 'tcam' pointer using mlxsw_sp_acl_to_tcam(). [1] BUG: kernel NULL pointer dereference, address: 0000000000000000 [...] RIP: 0010:mlxsw_sp_acl_tcam_region_destroy+0xa0/0xd0 [...] Call Trace: mlxsw_sp_acl_tcam_vchunk_get+0x88b/0xa20 mlxsw_sp_acl_tcam_ventry_add+0x25/0xe0 mlxsw_sp_acl_rule_add+0x47/0x240 mlxsw_sp_flower_replace+0x1a9/0x1d0 tc_setup_cb_add+0xdc/0x1c0 fl_hw_replace_filter+0x146/0x1f0 fl_change+0xc17/0x1360 tc_new_tfilter+0x472/0xb90 rtnetlink_rcv_msg+0x313/0x3b0 netlink_rcv_skb+0x58/0x100 netlink_unicast+0x244/0x390 netlink_sendmsg+0x1e4/0x440 ____sys_sendmsg+0x164/0x260 ___sys_sendmsg+0x9a/0xe0 __sys_sendmsg+0x7a/0xc0 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x63/0x6b Fixes: 22a677661f56 ("mlxsw: spectrum: Introduce ACL core with simple TCAM implementation") Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Reviewed-by: Jiri Pirko Signed-off-by: Petr Machata Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/fb6a4542bbc9fcab5a523802d97059bffbca7126.1705502064.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c index d50786b0a6ce4..7d1e91196e943 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c @@ -681,13 +681,13 @@ static void mlxsw_sp_acl_tcam_region_destroy(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_tcam_region *region) { + struct mlxsw_sp_acl_tcam *tcam = mlxsw_sp_acl_to_tcam(mlxsw_sp->acl); const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops; ops->region_fini(mlxsw_sp, region->priv); mlxsw_sp_acl_tcam_region_disable(mlxsw_sp, region); mlxsw_sp_acl_tcam_region_free(mlxsw_sp, region); - mlxsw_sp_acl_tcam_region_id_put(region->group->tcam, - region->id); + mlxsw_sp_acl_tcam_region_id_put(tcam, region->id); kfree(region); } -- GitLab From 483ae90d8f976f8339cf81066312e1329f2d3706 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 17 Jan 2024 16:04:18 +0100 Subject: [PATCH 1109/1290] mlxsw: spectrum_acl_tcam: Fix stack corruption When tc filters are first added to a net device, the corresponding local port gets bound to an ACL group in the device. The group contains a list of ACLs. In turn, each ACL points to a different TCAM region where the filters are stored. During forwarding, the ACLs are sequentially evaluated until a match is found. One reason to place filters in different regions is when they are added with decreasing priorities and in an alternating order so that two consecutive filters can never fit in the same region because of their key usage. In Spectrum-2 and newer ASICs the firmware started to report that the maximum number of ACLs in a group is more than 16, but the layout of the register that configures ACL groups (PAGT) was not updated to account for that. It is therefore possible to hit stack corruption [1] in the rare case where more than 16 ACLs in a group are required. Fix by limiting the maximum ACL group size to the minimum between what the firmware reports and the maximum ACLs that fit in the PAGT register. Add a test case to make sure the machine does not crash when this condition is hit. [1] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: mlxsw_sp_acl_tcam_group_update+0x116/0x120 [...] dump_stack_lvl+0x36/0x50 panic+0x305/0x330 __stack_chk_fail+0x15/0x20 mlxsw_sp_acl_tcam_group_update+0x116/0x120 mlxsw_sp_acl_tcam_group_region_attach+0x69/0x110 mlxsw_sp_acl_tcam_vchunk_get+0x492/0xa20 mlxsw_sp_acl_tcam_ventry_add+0x25/0xe0 mlxsw_sp_acl_rule_add+0x47/0x240 mlxsw_sp_flower_replace+0x1a9/0x1d0 tc_setup_cb_add+0xdc/0x1c0 fl_hw_replace_filter+0x146/0x1f0 fl_change+0xc17/0x1360 tc_new_tfilter+0x472/0xb90 rtnetlink_rcv_msg+0x313/0x3b0 netlink_rcv_skb+0x58/0x100 netlink_unicast+0x244/0x390 netlink_sendmsg+0x1e4/0x440 ____sys_sendmsg+0x164/0x260 ___sys_sendmsg+0x9a/0xe0 __sys_sendmsg+0x7a/0xc0 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x63/0x6b Fixes: c3ab435466d5 ("mlxsw: spectrum: Extend to support Spectrum-2 ASIC") Reported-by: Orel Hagag Signed-off-by: Ido Schimmel Reviewed-by: Amit Cohen Signed-off-by: Petr Machata Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/2d91c89afba59c22587b444994ae419dbea8d876.1705502064.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlxsw/spectrum_acl_tcam.c | 2 + .../drivers/net/mlxsw/spectrum-2/tc_flower.sh | 56 ++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c index 7d1e91196e943..50ea1eff02b2f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c @@ -1564,6 +1564,8 @@ int mlxsw_sp_acl_tcam_init(struct mlxsw_sp *mlxsw_sp, tcam->max_groups = max_groups; tcam->max_group_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_GROUP_SIZE); + tcam->max_group_size = min_t(unsigned int, tcam->max_group_size, + MLXSW_REG_PAGT_ACL_MAX_NUM); err = ops->init(mlxsw_sp, tcam->priv, tcam); if (err) diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh index 7bf56ea161e35..616d3581419ca 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh @@ -11,7 +11,7 @@ ALL_TESTS="single_mask_test identical_filters_test two_masks_test \ multiple_masks_test ctcam_edge_cases_test delta_simple_test \ delta_two_masks_one_key_test delta_simple_rehash_test \ bloom_simple_test bloom_complex_test bloom_delta_test \ - max_erp_entries_test" + max_erp_entries_test max_group_size_test" NUM_NETIFS=2 source $lib_dir/lib.sh source $lib_dir/tc_common.sh @@ -1033,6 +1033,60 @@ max_erp_entries_test() "max chain $chain_failed, mask $mask_failed" } +max_group_size_test() +{ + # The number of ACLs in an ACL group is limited. Once the maximum + # number of ACLs has been reached, filters cannot be added. This test + # verifies that when this limit is reached, insertion fails without + # crashing. + + RET=0 + + local num_acls=32 + local max_size + local ret + + if [[ "$tcflags" != "skip_sw" ]]; then + return 0; + fi + + for ((i=1; i < $num_acls; i++)); do + if [[ $(( i % 2 )) == 1 ]]; then + tc filter add dev $h2 ingress pref $i proto ipv4 \ + flower $tcflags dst_ip 198.51.100.1/32 \ + ip_proto tcp tcp_flags 0x01/0x01 \ + action drop &> /dev/null + else + tc filter add dev $h2 ingress pref $i proto ipv6 \ + flower $tcflags dst_ip 2001:db8:1::1/128 \ + action drop &> /dev/null + fi + + ret=$? + [[ $ret -ne 0 ]] && max_size=$((i - 1)) && break + done + + # We expect to exceed the maximum number of ACLs in a group, so that + # insertion eventually fails. Otherwise, the test should be adjusted to + # add more filters. + check_fail $ret "expected to exceed number of ACLs in a group" + + for ((; i >= 1; i--)); do + if [[ $(( i % 2 )) == 1 ]]; then + tc filter del dev $h2 ingress pref $i proto ipv4 \ + flower $tcflags dst_ip 198.51.100.1/32 \ + ip_proto tcp tcp_flags 0x01/0x01 \ + action drop &> /dev/null + else + tc filter del dev $h2 ingress pref $i proto ipv6 \ + flower $tcflags dst_ip 2001:db8:1::1/128 \ + action drop &> /dev/null + fi + done + + log_test "max ACL group size test ($tcflags). max size $max_size" +} + setup_prepare() { h1=${NETIFS[p1]} -- GitLab From 62bef63646c194e0f82b40304a0f2d060b28687b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 17 Jan 2024 16:04:19 +0100 Subject: [PATCH 1110/1290] mlxsw: spectrum_router: Register netdevice notifier before nexthop If there are IPIP nexthops at the time when the driver is loaded (or the devlink instance reloaded), the driver looks up the corresponding IPIP entry. But IPIP entries are only created as a result of netdevice notifications. Since the netdevice notifier is registered after the nexthop notifier, mlxsw_sp_nexthop_type_init() never finds the IPIP entry, registers the nexthop MLXSW_SP_NEXTHOP_TYPE_ETH, and fails to assign a CRIF to the nexthop. Later on when the CRIF is necessary, the WARN_ON in mlxsw_sp_nexthop_rif() triggers, causing the splat [1]. In order to fix the issue, reorder the netdevice notifier to be registered before the nexthop one. [1] (edited for clarity): WARNING: CPU: 1 PID: 1364 at drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3245 mlxsw_sp_nexthop_rif (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3246 (discriminator 1)) mlxsw_spectrum Hardware name: Mellanox Technologies Ltd. MSN4410/VMOD0010, BIOS 5.11 01/06/2019 Call Trace: ? mlxsw_sp_nexthop_rif (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3246 (discriminator 1)) mlxsw_spectrum __mlxsw_sp_nexthop_eth_update (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3637) mlxsw_spectrum mlxsw_sp_nexthop_update (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3679 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3727) mlxsw_spectrum mlxsw_sp_nexthop_group_update (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:3757) mlxsw_spectrum mlxsw_sp_nexthop_group_refresh (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:4112) mlxsw_spectrum mlxsw_sp_nexthop_obj_event (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:5118 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:5191 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:5315 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:5500) mlxsw_spectrum nexthops_dump (net/ipv4/nexthop.c:217 net/ipv4/nexthop.c:440 net/ipv4/nexthop.c:3609) register_nexthop_notifier (net/ipv4/nexthop.c:3624) mlxsw_sp_router_init (drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:11486) mlxsw_spectrum mlxsw_sp_init (drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3267) mlxsw_spectrum __mlxsw_core_bus_device_register (drivers/net/ethernet/mellanox/mlxsw/core.c:2202) mlxsw_core mlxsw_devlink_core_bus_device_reload_up (drivers/net/ethernet/mellanox/mlxsw/core.c:2265 drivers/net/ethernet/mellanox/mlxsw/core.c:1603) mlxsw_core devlink_reload (net/devlink/dev.c:314 net/devlink/dev.c:475) [...] Fixes: 9464a3d68ea9 ("mlxsw: spectrum_router: Track next hops at CRIFs") Reported-by: Maksym Yaremchuk Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/74edb8d45d004e8d8f5318eede6ccc3d786d8ba9.1705502064.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlxsw/spectrum_router.c | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 2c255ed9b8a94..7164f9e6370fb 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -11472,6 +11472,13 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp, if (err) goto err_register_netevent_notifier; + mlxsw_sp->router->netdevice_nb.notifier_call = + mlxsw_sp_router_netdevice_event; + err = register_netdevice_notifier_net(mlxsw_sp_net(mlxsw_sp), + &mlxsw_sp->router->netdevice_nb); + if (err) + goto err_register_netdev_notifier; + mlxsw_sp->router->nexthop_nb.notifier_call = mlxsw_sp_nexthop_obj_event; err = register_nexthop_notifier(mlxsw_sp_net(mlxsw_sp), @@ -11487,22 +11494,15 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp, if (err) goto err_register_fib_notifier; - mlxsw_sp->router->netdevice_nb.notifier_call = - mlxsw_sp_router_netdevice_event; - err = register_netdevice_notifier_net(mlxsw_sp_net(mlxsw_sp), - &mlxsw_sp->router->netdevice_nb); - if (err) - goto err_register_netdev_notifier; - return 0; -err_register_netdev_notifier: - unregister_fib_notifier(mlxsw_sp_net(mlxsw_sp), - &mlxsw_sp->router->fib_nb); err_register_fib_notifier: unregister_nexthop_notifier(mlxsw_sp_net(mlxsw_sp), &mlxsw_sp->router->nexthop_nb); err_register_nexthop_notifier: + unregister_netdevice_notifier_net(mlxsw_sp_net(mlxsw_sp), + &router->netdevice_nb); +err_register_netdev_notifier: unregister_netevent_notifier(&mlxsw_sp->router->netevent_nb); err_register_netevent_notifier: unregister_inet6addr_validator_notifier(&router->inet6addr_valid_nb); @@ -11550,11 +11550,11 @@ void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) { struct mlxsw_sp_router *router = mlxsw_sp->router; - unregister_netdevice_notifier_net(mlxsw_sp_net(mlxsw_sp), - &router->netdevice_nb); unregister_fib_notifier(mlxsw_sp_net(mlxsw_sp), &router->fib_nb); unregister_nexthop_notifier(mlxsw_sp_net(mlxsw_sp), &router->nexthop_nb); + unregister_netdevice_notifier_net(mlxsw_sp_net(mlxsw_sp), + &router->netdevice_nb); unregister_netevent_notifier(&router->netevent_nb); unregister_inet6addr_validator_notifier(&router->inet6addr_valid_nb); unregister_inetaddr_validator_notifier(&router->inetaddr_valid_nb); -- GitLab From 40cc674bafd50fc728c4a73d7dc77728a0b86759 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 17 Jan 2024 16:04:20 +0100 Subject: [PATCH 1111/1290] selftests: mlxsw: qos_pfc: Remove wrong description In the diagram of the topology, $swp3 and $swp4 are described as 1Gbps ports. This is wrong information, the test does not configure such speed. Cc: Shuah Khan Fixes: bfa804784e32 ("selftests: mlxsw: Add a PFC test") Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/0087e2d416aff7e444d15f7c2958fc1d438dc27e.1705502064.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh index 42ce602d8d492..49bef76083b83 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh @@ -40,7 +40,6 @@ # | + $swp1 $swp3 + + $swp4 | # | | iPOOL1 iPOOL0 | | iPOOL2 | # | | ePOOL4 ePOOL5 | | ePOOL4 | -# | | 1Gbps | | 1Gbps | # | | PFC:enabled=1 | | PFC:enabled=1 | # | +-|----------------------|-+ +-|------------------------+ | # | | + $swp1.111 $swp3.111 + | | + $swp4.111 | | -- GitLab From b34f4de6d30cbaa8fed905a5080b6eace8c84dc7 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 17 Jan 2024 16:04:21 +0100 Subject: [PATCH 1112/1290] selftests: mlxsw: qos_pfc: Adjust the test to support 8 lanes 'qos_pfc' test checks PFC behavior. The idea is to limit the traffic using a shaper somewhere in the flow of the packets. In this area, the buffer is smaller than the buffer at the beginning of the flow, so it fills up until there is no more space left. The test configures there PFC which is supposed to notice that the headroom is filling up and send PFC Xoff to indicate the transmitter to stop sending traffic for the priorities sharing this PG. The Xon/Xoff threshold is auto-configured and always equal to 2*(MTU rounded up to cell size). Even after sending the PFC Xoff packet, traffic will keep arriving until the transmitter receives and processes the PFC packet. This amount of traffic is known as the PFC delay allowance. Currently the buffer for the delay traffic is configured as 100KB. The MTU in the test is 10KB, therefore the threshold for Xoff is about 20KB. This allows 80KB extra to be stored in this buffer. 8-lane ports use two buffers among which the configured buffer is split, the Xoff threshold then applies to each buffer in parallel. The test does not take into account the behavior of 8-lane ports, when the ports are configured to 400Gbps with 8 lanes or 800Gbps with 8 lanes, packets are dropped and the test fails. Check if the relevant ports use 8 lanes, in such case double the size of the buffer, as the headroom is split half-half. Cc: Shuah Khan Fixes: bfa804784e32 ("selftests: mlxsw: Add a PFC test") Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/23ff11b7dff031eb04a41c0f5254a2b636cd8ebb.1705502064.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/mlxsw/qos_pfc.sh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh index 49bef76083b83..0f0f4f05807c9 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh @@ -119,6 +119,9 @@ h2_destroy() switch_create() { + local lanes_swp4 + local pg1_size + # pools # ----- @@ -228,7 +231,20 @@ switch_create() dcb pfc set dev $swp4 prio-pfc all:off 1:on # PG0 will get autoconfigured to Xoff, give PG1 arbitrarily 100K, which # is (-2*MTU) about 80K of delay provision. - dcb buffer set dev $swp4 buffer-size all:0 1:$_100KB + pg1_size=$_100KB + + setup_wait_dev_with_timeout $swp4 + + lanes_swp4=$(ethtool $swp4 | grep 'Lanes:') + lanes_swp4=${lanes_swp4#*"Lanes: "} + + # 8-lane ports use two buffers among which the configured buffer + # is split, so double the size to get twice (20K + 80K). + if [[ $lanes_swp4 -eq 8 ]]; then + pg1_size=$((pg1_size * 2)) + fi + + dcb buffer set dev $swp4 buffer-size all:0 1:$pg1_size # bridges # ------- -- GitLab From 2e7ef287f07c74985f1bf2858bedc62bd9ebf155 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Wed, 17 Jan 2024 09:21:02 -0800 Subject: [PATCH 1113/1290] ipv6: mcast: fix data-race in ipv6_mc_down / mld_ifc_work idev->mc_ifc_count can be written over without proper locking. Originally found by syzbot [1], fix this issue by encapsulating calls to mld_ifc_stop_work() (and mld_gq_stop_work() for good measure) with mutex_lock() and mutex_unlock() accordingly as these functions should only be called with mc_lock per their declarations. [1] BUG: KCSAN: data-race in ipv6_mc_down / mld_ifc_work write to 0xffff88813a80c832 of 1 bytes by task 3771 on cpu 0: mld_ifc_stop_work net/ipv6/mcast.c:1080 [inline] ipv6_mc_down+0x10a/0x280 net/ipv6/mcast.c:2725 addrconf_ifdown+0xe32/0xf10 net/ipv6/addrconf.c:3949 addrconf_notify+0x310/0x980 notifier_call_chain kernel/notifier.c:93 [inline] raw_notifier_call_chain+0x6b/0x1c0 kernel/notifier.c:461 __dev_notify_flags+0x205/0x3d0 dev_change_flags+0xab/0xd0 net/core/dev.c:8685 do_setlink+0x9f6/0x2430 net/core/rtnetlink.c:2916 rtnl_group_changelink net/core/rtnetlink.c:3458 [inline] __rtnl_newlink net/core/rtnetlink.c:3717 [inline] rtnl_newlink+0xbb3/0x1670 net/core/rtnetlink.c:3754 rtnetlink_rcv_msg+0x807/0x8c0 net/core/rtnetlink.c:6558 netlink_rcv_skb+0x126/0x220 net/netlink/af_netlink.c:2545 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:6576 netlink_unicast_kernel net/netlink/af_netlink.c:1342 [inline] netlink_unicast+0x589/0x650 net/netlink/af_netlink.c:1368 netlink_sendmsg+0x66e/0x770 net/netlink/af_netlink.c:1910 ... write to 0xffff88813a80c832 of 1 bytes by task 22 on cpu 1: mld_ifc_work+0x54c/0x7b0 net/ipv6/mcast.c:2653 process_one_work kernel/workqueue.c:2627 [inline] process_scheduled_works+0x5b8/0xa30 kernel/workqueue.c:2700 worker_thread+0x525/0x730 kernel/workqueue.c:2781 ... Fixes: 2d9a93b4902b ("mld: convert from timer to delayed work") Reported-by: syzbot+a9400cabb1d784e49abf@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000994e09060ebcdffb@google.com/ Signed-off-by: Nikita Zhandarovich Acked-by: Taehee Yoo Reviewed-by: Eric Dumazet Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20240117172102.12001-1-n.zhandarovich@fintech.ru Signed-off-by: Jakub Kicinski --- net/ipv6/mcast.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index b75d3c9d41bb5..bc6e0a0bad3c1 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2722,8 +2722,12 @@ void ipv6_mc_down(struct inet6_dev *idev) synchronize_net(); mld_query_stop_work(idev); mld_report_stop_work(idev); + + mutex_lock(&idev->mc_lock); mld_ifc_stop_work(idev); mld_gq_stop_work(idev); + mutex_unlock(&idev->mc_lock); + mld_dad_stop_work(idev); } -- GitLab From 9cfd3b502153810b66ac0ce47f1fba682228f2d2 Mon Sep 17 00:00:00 2001 From: Tony Nguyen Date: Wed, 17 Jan 2024 09:25:32 -0800 Subject: [PATCH 1114/1290] i40e: Include types.h to some headers Commit 56df345917c0 ("i40e: Remove circular header dependencies and fix headers") redistributed a number of includes from one large header file to the locations they were needed. In some environments, types.h is not included and causing compile issues. The driver should not rely on implicit inclusion from other locations; explicitly include it to these files. Snippet of issue. Entire log can be seen through the Closes: link. In file included from drivers/net/ethernet/intel/i40e/i40e_diag.h:7, from drivers/net/ethernet/intel/i40e/i40e_diag.c:4: drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h:33:9: error: unknown type name '__le16' 33 | __le16 flags; | ^~~~~~ drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h:34:9: error: unknown type name '__le16' 34 | __le16 opcode; | ^~~~~~ ... drivers/net/ethernet/intel/i40e/i40e_diag.h:22:9: error: unknown type name 'u32' 22 | u32 elements; /* number of elements if array */ | ^~~ drivers/net/ethernet/intel/i40e/i40e_diag.h:23:9: error: unknown type name 'u32' 23 | u32 stride; /* bytes between each element */ Reported-by: Martin Zaharinov Closes: https://lore.kernel.org/netdev/21BBD62A-F874-4E42-B347-93087EEA8126@gmail.com/ Fixes: 56df345917c0 ("i40e: Remove circular header dependencies and fix headers") Reviewed-by: Jesse Brandeburg Reviewed-by: Simon Horman Tested-by: Arpana Arland (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20240117172534.3555162-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h | 1 + drivers/net/ethernet/intel/i40e/i40e_diag.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index 18a1c3b6d72c5..c8f35d4de271a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -5,6 +5,7 @@ #define _I40E_ADMINQ_CMD_H_ #include +#include /* This header file defines the i40e Admin Queue commands and is shared between * i40e Firmware and Software. diff --git a/drivers/net/ethernet/intel/i40e/i40e_diag.h b/drivers/net/ethernet/intel/i40e/i40e_diag.h index ece3a6b9a5c61..ab20202a3da3c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_diag.h +++ b/drivers/net/ethernet/intel/i40e/i40e_diag.h @@ -4,6 +4,7 @@ #ifndef _I40E_DIAG_H_ #define _I40E_DIAG_H_ +#include #include "i40e_adminq_cmd.h" /* forward-declare the HW struct for the compiler */ -- GitLab From d8392c203e84ec7daa2afecdb8f4db69bc32416a Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 17 Jan 2024 16:15:18 -0600 Subject: [PATCH 1115/1290] smb3: show beginning time for per share stats In analyzing problems, one missing piece of debug data is when the mount occurred. A related problem is when collecting stats we don't know the period of time the stats covered, ie when this set of stats for the tcon started to be collected. To make debugging easier track the stats begin time. Set it when the mount occurred at mount time, and reset it to current time whenever stats are reset. For example, ... 1) \\localhost\test SMBs: 14 since 2024-01-17 22:17:30 UTC Bytes read: 0 Bytes written: 0 Open files: 0 total (local), 0 open on server TreeConnects: 1 total 0 failed TreeDisconnects: 0 total 0 failed ... 2) \\localhost\scratch SMBs: 24 since 2024-01-17 22:16:04 UTC Bytes read: 0 Bytes written: 0 Open files: 0 total (local), 0 open on server TreeConnects: 1 total 0 failed TreeDisconnects: 0 total 0 failed ... Note the time "since ... UTC" is now displayed in /proc/fs/cifs/Stats for each share that is mounted. Suggested-by: Shyam Prasad N Reviewed-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 6 ++++-- fs/smb/client/cifsglob.h | 1 + fs/smb/client/misc.c | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 60027f5aebe87..3e4209f41c18f 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -659,6 +659,7 @@ static ssize_t cifs_stats_proc_write(struct file *file, spin_lock(&tcon->stat_lock); tcon->bytes_read = 0; tcon->bytes_written = 0; + tcon->stats_from_time = ktime_get_real_seconds(); spin_unlock(&tcon->stat_lock); if (server->ops->clear_stats) server->ops->clear_stats(tcon); @@ -737,8 +738,9 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) seq_printf(m, "\n%d) %s", i, tcon->tree_name); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); - seq_printf(m, "\nSMBs: %d", - atomic_read(&tcon->num_smbs_sent)); + seq_printf(m, "\nSMBs: %d since %ptTs UTC", + atomic_read(&tcon->num_smbs_sent), + &tcon->stats_from_time); if (server->ops->print_stats) server->ops->print_stats(m, tcon); } diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 879d5ef8a66ed..f576ceee6157a 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1207,6 +1207,7 @@ struct cifs_tcon { __u64 bytes_read; __u64 bytes_written; spinlock_t stat_lock; /* protects the two fields above */ + time64_t stats_from_time; FILE_SYSTEM_DEVICE_INFO fsDevInfo; FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ FILE_SYSTEM_UNIX_INFO fsUnixInfo; diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index c2137ea3c2538..0748d7b757b95 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -140,6 +140,7 @@ tcon_info_alloc(bool dir_leases_enabled) spin_lock_init(&ret_buf->stat_lock); atomic_set(&ret_buf->num_local_opens, 0); atomic_set(&ret_buf->num_remote_opens, 0); + ret_buf->stats_from_time = ktime_get_real_seconds(); #ifdef CONFIG_CIFS_DFS_UPCALL INIT_LIST_HEAD(&ret_buf->dfs_ses_list); #endif -- GitLab From 0b549c4f594167d7ef056393c6a06ac77f5690ff Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 17 Jan 2024 16:56:05 -0600 Subject: [PATCH 1116/1290] cifs: minor comment cleanup minor comment cleanup and trivial camelCase removal Reviewed-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/readdir.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 056cae1ddccef..e24684112ab0a 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -645,10 +645,10 @@ static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode) static int is_dir_changed(struct file *file) { struct inode *inode = file_inode(file); - struct cifsInodeInfo *cifsInfo = CIFS_I(inode); + struct cifsInodeInfo *cifs_inode_info = CIFS_I(inode); - if (cifsInfo->time == 0) - return 1; /* directory was changed, perhaps due to unlink */ + if (cifs_inode_info->time == 0) + return 1; /* directory was changed, e.g. unlink or new file */ else return 0; -- GitLab From c3365ced1375db779d2244695a7f936fd2f1bdb5 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 17 Jan 2024 17:12:57 -0600 Subject: [PATCH 1117/1290] Update MAINTAINERS email address Ronnie is no longer at Redhat. Update his email address. Signed-off-by: Steve French --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4c0135b70caeb..75d5308f57774 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5205,7 +5205,7 @@ X: drivers/clk/clkdev.c COMMON INTERNET FILE SYSTEM CLIENT (CIFS and SMB3) M: Steve French R: Paulo Alcantara (DFS, global name space) -R: Ronnie Sahlberg (directory leases, sparse files) +R: Ronnie Sahlberg (directory leases, sparse files) R: Shyam Prasad N (multichannel) R: Tom Talpey (RDMA, smbdirect) L: linux-cifs@vger.kernel.org -- GitLab From 18f14afe281648e31ed35c9ad2fcb724c4838ad9 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 15 Dec 2023 23:44:49 +1100 Subject: [PATCH 1118/1290] powerpc/64s: Increase default stack size to 32KB There are reports of kernels crashing due to stack overflow while running OpenShift (Kubernetes). The primary contributor to the stack usage seems to be openvswitch, which is used by OVN-Kubernetes (based on OVN (Open Virtual Network)), but NFS also contributes in some stack traces. There may be some opportunities to reduce stack usage in the openvswitch code, but doing so potentially require tradeoffs vs performance, and also requires testing across architectures. Looking at stack usage across the kernel (using -fstack-usage), shows that ppc64le stack frames are on average 50-100% larger than the equivalent function built for x86-64. Which is not surprising given the minimum stack frame size is 32 bytes on ppc64le vs 16 bytes on x86-64. So increase the default stack size to 32KB for the modern 64-bit Book3S platforms, ie. pseries (virtualised) and powernv (bare metal). That leaves the older systems like G5s, and the AmigaOne (pasemi) with a 16KB stack which should be sufficient on those machines. Signed-off-by: Michael Ellerman Signed-off-by: Aneesh Kumar K.V (IBM) Link: https://msgid.link/20231215124449.317597-1-mpe@ellerman.id.au --- arch/powerpc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6f105ee4f3cf5..2df545c1446ec 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -858,6 +858,7 @@ config THREAD_SHIFT int "Thread shift" if EXPERT range 13 15 default "15" if PPC_256K_PAGES + default "15" if PPC_PSERIES || PPC_POWERNV default "14" if PPC64 default "13" help -- GitLab From e965a707276760cc010eb77fba64b08ee9e8781f Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 23 Nov 2023 10:40:21 +0100 Subject: [PATCH 1119/1290] drm: remove I2C_CLASS_DDC support After removal of the legacy EEPROM driver and I2C_CLASS_DDC support in olpc_dcon there's no i2c client driver left supporting I2C_CLASS_DDC. Class-based device auto-detection is a legacy mechanism and shouldn't be used in new code. So we can remove this class completely now. Acked-by: Alex Deucher Acked-by: Dmitry Baryshkov Acked-by: Harry Wentland Acked-by: Heiko Stuebner Acked-by: Jani Nikula Acked-by: Jernej Skrabec Reviewed-by: Thomas Zimmermann Reviewed-by: Andi Shyti Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Neil Armstrong Signed-off-by: Heiner Kallweit Signed-off-by: Wolfram Sang --- drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c | 1 - drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 1 - drivers/gpu/drm/ast/ast_i2c.c | 1 - drivers/gpu/drm/bridge/synopsys/dw-hdmi.c | 1 - drivers/gpu/drm/display/drm_dp_helper.c | 1 - drivers/gpu/drm/display/drm_dp_mst_topology.c | 1 - drivers/gpu/drm/gma500/cdv_intel_dp.c | 1 - drivers/gpu/drm/gma500/intel_gmbus.c | 1 - drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c | 1 - drivers/gpu/drm/gma500/psb_intel_sdvo.c | 1 - drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_i2c.c | 1 - drivers/gpu/drm/i915/display/intel_gmbus.c | 1 - drivers/gpu/drm/i915/display/intel_sdvo.c | 1 - drivers/gpu/drm/loongson/lsdc_i2c.c | 1 - drivers/gpu/drm/mediatek/mtk_hdmi_ddc.c | 1 - drivers/gpu/drm/mgag200/mgag200_i2c.c | 1 - drivers/gpu/drm/msm/hdmi/hdmi_i2c.c | 1 - drivers/gpu/drm/radeon/radeon_i2c.c | 1 - drivers/gpu/drm/rockchip/inno_hdmi.c | 1 - drivers/gpu/drm/rockchip/rk3066_hdmi.c | 1 - drivers/gpu/drm/sun4i/sun4i_hdmi_i2c.c | 1 - 21 files changed, 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c index 82608df433964..d79cb13e1aa83 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c @@ -175,7 +175,6 @@ struct amdgpu_i2c_chan *amdgpu_i2c_create(struct drm_device *dev, i2c->rec = *rec; i2c->adapter.owner = THIS_MODULE; - i2c->adapter.class = I2C_CLASS_DDC; i2c->adapter.dev.parent = dev->dev; i2c->dev = dev; i2c_set_adapdata(&i2c->adapter, i2c); diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index ee97814ebd994..bbd511567a73a 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -7533,7 +7533,6 @@ create_i2c(struct ddc_service *ddc_service, if (!i2c) return NULL; i2c->base.owner = THIS_MODULE; - i2c->base.class = I2C_CLASS_DDC; i2c->base.dev.parent = &adev->pdev->dev; i2c->base.algo = &amdgpu_dm_i2c_algo; snprintf(i2c->base.name, sizeof(i2c->base.name), "AMDGPU DM i2c hw bus %d", link_index); diff --git a/drivers/gpu/drm/ast/ast_i2c.c b/drivers/gpu/drm/ast/ast_i2c.c index 0e845e7acd9b5..e5d3f7121de42 100644 --- a/drivers/gpu/drm/ast/ast_i2c.c +++ b/drivers/gpu/drm/ast/ast_i2c.c @@ -120,7 +120,6 @@ struct ast_i2c_chan *ast_i2c_create(struct drm_device *dev) return NULL; i2c->adapter.owner = THIS_MODULE; - i2c->adapter.class = I2C_CLASS_DDC; i2c->adapter.dev.parent = dev->dev; i2c->dev = dev; i2c_set_adapdata(&i2c->adapter, i2c); diff --git a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c index 52d91a0df85e9..aca5bb0866f88 100644 --- a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c +++ b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c @@ -515,7 +515,6 @@ static struct i2c_adapter *dw_hdmi_i2c_adapter(struct dw_hdmi *hdmi) init_completion(&i2c->cmp); adap = &i2c->adap; - adap->class = I2C_CLASS_DDC; adap->owner = THIS_MODULE; adap->dev.parent = hdmi->dev; adap->algo = &dw_hdmi_algorithm; diff --git a/drivers/gpu/drm/display/drm_dp_helper.c b/drivers/gpu/drm/display/drm_dp_helper.c index f3680f4e69708..ac901f4b48f23 100644 --- a/drivers/gpu/drm/display/drm_dp_helper.c +++ b/drivers/gpu/drm/display/drm_dp_helper.c @@ -2102,7 +2102,6 @@ int drm_dp_aux_register(struct drm_dp_aux *aux) if (!aux->ddc.algo) drm_dp_aux_init(aux); - aux->ddc.class = I2C_CLASS_DDC; aux->ddc.owner = THIS_MODULE; aux->ddc.dev.parent = aux->dev; diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c index 0e0d0e76de065..4376e2c1f2d43 100644 --- a/drivers/gpu/drm/display/drm_dp_mst_topology.c +++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c @@ -5803,7 +5803,6 @@ static int drm_dp_mst_register_i2c_bus(struct drm_dp_mst_port *port) aux->ddc.algo_data = aux; aux->ddc.retries = 3; - aux->ddc.class = I2C_CLASS_DDC; aux->ddc.owner = THIS_MODULE; /* FIXME: set the kdev of the port's connector as parent */ aux->ddc.dev.parent = parent_dev; diff --git a/drivers/gpu/drm/gma500/cdv_intel_dp.c b/drivers/gpu/drm/gma500/cdv_intel_dp.c index 8992a95076f29..dd1eb7e9877d4 100644 --- a/drivers/gpu/drm/gma500/cdv_intel_dp.c +++ b/drivers/gpu/drm/gma500/cdv_intel_dp.c @@ -855,7 +855,6 @@ cdv_intel_dp_i2c_init(struct gma_connector *connector, memset(&intel_dp->adapter, '\0', sizeof (intel_dp->adapter)); intel_dp->adapter.owner = THIS_MODULE; - intel_dp->adapter.class = I2C_CLASS_DDC; strncpy (intel_dp->adapter.name, name, sizeof(intel_dp->adapter.name) - 1); intel_dp->adapter.name[sizeof(intel_dp->adapter.name) - 1] = '\0'; intel_dp->adapter.algo_data = &intel_dp->algo; diff --git a/drivers/gpu/drm/gma500/intel_gmbus.c b/drivers/gpu/drm/gma500/intel_gmbus.c index 09cedabf4776d..aa45509859f21 100644 --- a/drivers/gpu/drm/gma500/intel_gmbus.c +++ b/drivers/gpu/drm/gma500/intel_gmbus.c @@ -411,7 +411,6 @@ int gma_intel_setup_gmbus(struct drm_device *dev) struct intel_gmbus *bus = &dev_priv->gmbus[i]; bus->adapter.owner = THIS_MODULE; - bus->adapter.class = I2C_CLASS_DDC; snprintf(bus->adapter.name, sizeof(bus->adapter.name), "gma500 gmbus %s", diff --git a/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c b/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c index fc9a34ed58bd1..6daa6669ed237 100644 --- a/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c +++ b/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c @@ -168,7 +168,6 @@ static struct i2c_adapter oaktrail_hdmi_i2c_adapter = { .name = "oaktrail_hdmi_i2c", .nr = 3, .owner = THIS_MODULE, - .class = I2C_CLASS_DDC, .algo = &oaktrail_hdmi_i2c_algorithm, }; diff --git a/drivers/gpu/drm/gma500/psb_intel_sdvo.c b/drivers/gpu/drm/gma500/psb_intel_sdvo.c index d6fd5d7262160..e4f914decebae 100644 --- a/drivers/gpu/drm/gma500/psb_intel_sdvo.c +++ b/drivers/gpu/drm/gma500/psb_intel_sdvo.c @@ -2426,7 +2426,6 @@ psb_intel_sdvo_init_ddc_proxy(struct psb_intel_sdvo *sdvo, struct drm_device *dev) { sdvo->ddc.owner = THIS_MODULE; - sdvo->ddc.class = I2C_CLASS_DDC; snprintf(sdvo->ddc.name, I2C_NAME_SIZE, "SDVO DDC proxy"); sdvo->ddc.dev.parent = dev->dev; sdvo->ddc.algo_data = sdvo; diff --git a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_i2c.c b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_i2c.c index 410bd019bb357..e6e48651c15c6 100644 --- a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_i2c.c +++ b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_i2c.c @@ -81,7 +81,6 @@ int hibmc_ddc_create(struct drm_device *drm_dev, struct hibmc_connector *connector) { connector->adapter.owner = THIS_MODULE; - connector->adapter.class = I2C_CLASS_DDC; snprintf(connector->adapter.name, I2C_NAME_SIZE, "HIS i2c bit bus"); connector->adapter.dev.parent = drm_dev->dev; i2c_set_adapdata(&connector->adapter, connector); diff --git a/drivers/gpu/drm/i915/display/intel_gmbus.c b/drivers/gpu/drm/i915/display/intel_gmbus.c index 40d7b6f3f4891..e9e4dcf345f95 100644 --- a/drivers/gpu/drm/i915/display/intel_gmbus.c +++ b/drivers/gpu/drm/i915/display/intel_gmbus.c @@ -899,7 +899,6 @@ int intel_gmbus_setup(struct drm_i915_private *i915) } bus->adapter.owner = THIS_MODULE; - bus->adapter.class = I2C_CLASS_DDC; snprintf(bus->adapter.name, sizeof(bus->adapter.name), "i915 gmbus %s", gmbus_pin->name); diff --git a/drivers/gpu/drm/i915/display/intel_sdvo.c b/drivers/gpu/drm/i915/display/intel_sdvo.c index a636f42ceae55..5e64d1baffe88 100644 --- a/drivers/gpu/drm/i915/display/intel_sdvo.c +++ b/drivers/gpu/drm/i915/display/intel_sdvo.c @@ -3311,7 +3311,6 @@ intel_sdvo_init_ddc_proxy(struct intel_sdvo_ddc *ddc, ddc->ddc_bus = ddc_bus; ddc->ddc.owner = THIS_MODULE; - ddc->ddc.class = I2C_CLASS_DDC; snprintf(ddc->ddc.name, I2C_NAME_SIZE, "SDVO %c DDC%d", port_name(sdvo->base.port), ddc_bus); ddc->ddc.dev.parent = &pdev->dev; diff --git a/drivers/gpu/drm/loongson/lsdc_i2c.c b/drivers/gpu/drm/loongson/lsdc_i2c.c index 9625d0b1d0b4d..ce90c25536d24 100644 --- a/drivers/gpu/drm/loongson/lsdc_i2c.c +++ b/drivers/gpu/drm/loongson/lsdc_i2c.c @@ -154,7 +154,6 @@ int lsdc_create_i2c_chan(struct drm_device *ddev, adapter = &li2c->adapter; adapter->algo_data = &li2c->bit; adapter->owner = THIS_MODULE; - adapter->class = I2C_CLASS_DDC; adapter->dev.parent = ddev->dev; adapter->nr = -1; diff --git a/drivers/gpu/drm/mediatek/mtk_hdmi_ddc.c b/drivers/gpu/drm/mediatek/mtk_hdmi_ddc.c index d675c954befe3..54e46e440e0f0 100644 --- a/drivers/gpu/drm/mediatek/mtk_hdmi_ddc.c +++ b/drivers/gpu/drm/mediatek/mtk_hdmi_ddc.c @@ -297,7 +297,6 @@ static int mtk_hdmi_ddc_probe(struct platform_device *pdev) strscpy(ddc->adap.name, "mediatek-hdmi-ddc", sizeof(ddc->adap.name)); ddc->adap.owner = THIS_MODULE; - ddc->adap.class = I2C_CLASS_DDC; ddc->adap.algo = &mtk_hdmi_ddc_algorithm; ddc->adap.retries = 3; ddc->adap.dev.of_node = dev->of_node; diff --git a/drivers/gpu/drm/mgag200/mgag200_i2c.c b/drivers/gpu/drm/mgag200/mgag200_i2c.c index 0c48bdf3e7f80..423eb302be7eb 100644 --- a/drivers/gpu/drm/mgag200/mgag200_i2c.c +++ b/drivers/gpu/drm/mgag200/mgag200_i2c.c @@ -106,7 +106,6 @@ int mgag200_i2c_init(struct mga_device *mdev, struct mga_i2c_chan *i2c) i2c->data = BIT(info->i2c.data_bit); i2c->clock = BIT(info->i2c.clock_bit); i2c->adapter.owner = THIS_MODULE; - i2c->adapter.class = I2C_CLASS_DDC; i2c->adapter.dev.parent = dev->dev; i2c->dev = dev; i2c_set_adapdata(&i2c->adapter, i2c); diff --git a/drivers/gpu/drm/msm/hdmi/hdmi_i2c.c b/drivers/gpu/drm/msm/hdmi/hdmi_i2c.c index de182c0048434..7aa500d24240f 100644 --- a/drivers/gpu/drm/msm/hdmi/hdmi_i2c.c +++ b/drivers/gpu/drm/msm/hdmi/hdmi_i2c.c @@ -249,7 +249,6 @@ struct i2c_adapter *msm_hdmi_i2c_init(struct hdmi *hdmi) i2c->owner = THIS_MODULE; - i2c->class = I2C_CLASS_DDC; snprintf(i2c->name, sizeof(i2c->name), "msm hdmi i2c"); i2c->dev.parent = &hdmi->pdev->dev; i2c->algo = &msm_hdmi_i2c_algorithm; diff --git a/drivers/gpu/drm/radeon/radeon_i2c.c b/drivers/gpu/drm/radeon/radeon_i2c.c index 314d066e68e9d..3d174390a8afe 100644 --- a/drivers/gpu/drm/radeon/radeon_i2c.c +++ b/drivers/gpu/drm/radeon/radeon_i2c.c @@ -918,7 +918,6 @@ struct radeon_i2c_chan *radeon_i2c_create(struct drm_device *dev, i2c->rec = *rec; i2c->adapter.owner = THIS_MODULE; - i2c->adapter.class = I2C_CLASS_DDC; i2c->adapter.dev.parent = dev->dev; i2c->dev = dev; i2c_set_adapdata(&i2c->adapter, i2c); diff --git a/drivers/gpu/drm/rockchip/inno_hdmi.c b/drivers/gpu/drm/rockchip/inno_hdmi.c index 6e5b922a121e2..a7739b27c42b5 100644 --- a/drivers/gpu/drm/rockchip/inno_hdmi.c +++ b/drivers/gpu/drm/rockchip/inno_hdmi.c @@ -793,7 +793,6 @@ static struct i2c_adapter *inno_hdmi_i2c_adapter(struct inno_hdmi *hdmi) init_completion(&i2c->cmp); adap = &i2c->adap; - adap->class = I2C_CLASS_DDC; adap->owner = THIS_MODULE; adap->dev.parent = hdmi->dev; adap->dev.of_node = hdmi->dev->of_node; diff --git a/drivers/gpu/drm/rockchip/rk3066_hdmi.c b/drivers/gpu/drm/rockchip/rk3066_hdmi.c index fa6e592e0276c..7a3f71aa2ca9d 100644 --- a/drivers/gpu/drm/rockchip/rk3066_hdmi.c +++ b/drivers/gpu/drm/rockchip/rk3066_hdmi.c @@ -725,7 +725,6 @@ static struct i2c_adapter *rk3066_hdmi_i2c_adapter(struct rk3066_hdmi *hdmi) init_completion(&i2c->cmpltn); adap = &i2c->adap; - adap->class = I2C_CLASS_DDC; adap->owner = THIS_MODULE; adap->dev.parent = hdmi->dev; adap->dev.of_node = hdmi->dev->of_node; diff --git a/drivers/gpu/drm/sun4i/sun4i_hdmi_i2c.c b/drivers/gpu/drm/sun4i/sun4i_hdmi_i2c.c index d1a65a921f5af..f5f62eb0eecaa 100644 --- a/drivers/gpu/drm/sun4i/sun4i_hdmi_i2c.c +++ b/drivers/gpu/drm/sun4i/sun4i_hdmi_i2c.c @@ -302,7 +302,6 @@ int sun4i_hdmi_i2c_create(struct device *dev, struct sun4i_hdmi *hdmi) return -ENOMEM; adap->owner = THIS_MODULE; - adap->class = I2C_CLASS_DDC; adap->algo = &sun4i_hdmi_i2c_algorithm; strscpy(adap->name, "sun4i_hdmi_i2c adapter", sizeof(adap->name)); i2c_set_adapdata(adap, hdmi); -- GitLab From 754bd2fffc913e523d1b9e686e8b1fd3a70d8f7e Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 23 Nov 2023 10:40:25 +0100 Subject: [PATCH 1120/1290] fbdev: remove I2C_CLASS_DDC support After removal of the legacy EEPROM driver and I2C_CLASS_DDC support in olpc_dcon there's no i2c client driver left supporting I2C_CLASS_DDC. Class-based device auto-detection is a legacy mechanism and shouldn't be used in new code. So we can remove this class completely now. Acked-by: Helge Deller Acked-by: Thomas Zimmermann Signed-off-by: Heiner Kallweit Signed-off-by: Wolfram Sang --- drivers/video/fbdev/core/fb_ddc.c | 1 - drivers/video/fbdev/cyber2000fb.c | 1 - drivers/video/fbdev/i740fb.c | 1 - drivers/video/fbdev/intelfb/intelfb_i2c.c | 15 +++++---------- drivers/video/fbdev/matrox/i2c-matroxfb.c | 15 +++++---------- drivers/video/fbdev/s3fb.c | 1 - drivers/video/fbdev/tdfxfb.c | 1 - drivers/video/fbdev/tridentfb.c | 1 - drivers/video/fbdev/via/via_i2c.c | 1 - 9 files changed, 10 insertions(+), 27 deletions(-) diff --git a/drivers/video/fbdev/core/fb_ddc.c b/drivers/video/fbdev/core/fb_ddc.c index 8bf5f2f54be7b..e251432198629 100644 --- a/drivers/video/fbdev/core/fb_ddc.c +++ b/drivers/video/fbdev/core/fb_ddc.c @@ -116,7 +116,6 @@ unsigned char *fb_ddc_read(struct i2c_adapter *adapter) algo_data->setsda(algo_data->data, 1); algo_data->setscl(algo_data->data, 1); - adapter->class |= I2C_CLASS_DDC; return edid; } diff --git a/drivers/video/fbdev/cyber2000fb.c b/drivers/video/fbdev/cyber2000fb.c index 52105dc1a72f3..79775dedacebc 100644 --- a/drivers/video/fbdev/cyber2000fb.c +++ b/drivers/video/fbdev/cyber2000fb.c @@ -1234,7 +1234,6 @@ static int cyber2000fb_setup_ddc_bus(struct cfb_info *cfb) strscpy(cfb->ddc_adapter.name, cfb->fb.fix.id, sizeof(cfb->ddc_adapter.name)); cfb->ddc_adapter.owner = THIS_MODULE; - cfb->ddc_adapter.class = I2C_CLASS_DDC; cfb->ddc_adapter.algo_data = &cfb->ddc_algo; cfb->ddc_adapter.dev.parent = cfb->fb.device; cfb->ddc_algo.setsda = cyber2000fb_ddc_setsda; diff --git a/drivers/video/fbdev/i740fb.c b/drivers/video/fbdev/i740fb.c index 1897e65ab7031..9b74dae71472c 100644 --- a/drivers/video/fbdev/i740fb.c +++ b/drivers/video/fbdev/i740fb.c @@ -163,7 +163,6 @@ static int i740fb_setup_ddc_bus(struct fb_info *info) strscpy(par->ddc_adapter.name, info->fix.id, sizeof(par->ddc_adapter.name)); par->ddc_adapter.owner = THIS_MODULE; - par->ddc_adapter.class = I2C_CLASS_DDC; par->ddc_adapter.algo_data = &par->ddc_algo; par->ddc_adapter.dev.parent = info->device; par->ddc_algo.setsda = i740fb_ddc_setsda; diff --git a/drivers/video/fbdev/intelfb/intelfb_i2c.c b/drivers/video/fbdev/intelfb/intelfb_i2c.c index 3300bd31d9d7e..f24c7cb4c2ea6 100644 --- a/drivers/video/fbdev/intelfb/intelfb_i2c.c +++ b/drivers/video/fbdev/intelfb/intelfb_i2c.c @@ -99,8 +99,7 @@ static int intelfb_gpio_getsda(void *data) static int intelfb_setup_i2c_bus(struct intelfb_info *dinfo, struct intelfb_i2c_chan *chan, - const u32 reg, const char *name, - int class) + const u32 reg, const char *name) { int rc; @@ -108,7 +107,6 @@ static int intelfb_setup_i2c_bus(struct intelfb_info *dinfo, chan->reg = reg; snprintf(chan->adapter.name, sizeof(chan->adapter.name), "intelfb %s", name); - chan->adapter.class = class; chan->adapter.owner = THIS_MODULE; chan->adapter.algo_data = &chan->algo; chan->adapter.dev.parent = &chan->dinfo->pdev->dev; @@ -144,8 +142,7 @@ void intelfb_create_i2c_busses(struct intelfb_info *dinfo) dinfo->output[i].type = INTELFB_OUTPUT_ANALOG; /* setup the DDC bus for analog output */ - intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].ddc_bus, GPIOA, - "CRTDDC_A", I2C_CLASS_DDC); + intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].ddc_bus, GPIOA, "CRTDDC_A"); i++; /* need to add the output busses for each device @@ -159,10 +156,8 @@ void intelfb_create_i2c_busses(struct intelfb_info *dinfo) case INTEL_855GM: case INTEL_865G: dinfo->output[i].type = INTELFB_OUTPUT_DVO; - intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].ddc_bus, - GPIOD, "DVODDC_D", I2C_CLASS_DDC); - intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].i2c_bus, - GPIOE, "DVOI2C_E", 0); + intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].ddc_bus, GPIOD, "DVODDC_D"); + intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].i2c_bus, GPIOE, "DVOI2C_E"); i++; break; case INTEL_915G: @@ -176,7 +171,7 @@ void intelfb_create_i2c_busses(struct intelfb_info *dinfo) /* SDVO ports have a single control bus - 2 devices */ dinfo->output[i].type = INTELFB_OUTPUT_SDVO; intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].i2c_bus, - GPIOE, "SDVOCTRL_E", 0); + GPIOE, "SDVOCTRL_E"); /* TODO: initialize the SDVO */ /* I830SDVOInit(pScrn, i, DVOB); */ i++; diff --git a/drivers/video/fbdev/matrox/i2c-matroxfb.c b/drivers/video/fbdev/matrox/i2c-matroxfb.c index e2e4705e3fe0e..bb048e14b2cf1 100644 --- a/drivers/video/fbdev/matrox/i2c-matroxfb.c +++ b/drivers/video/fbdev/matrox/i2c-matroxfb.c @@ -100,8 +100,7 @@ static const struct i2c_algo_bit_data matrox_i2c_algo_template = }; static int i2c_bus_reg(struct i2c_bit_adapter* b, struct matrox_fb_info* minfo, - unsigned int data, unsigned int clock, const char *name, - int class) + unsigned int data, unsigned int clock, const char *name) { int err; @@ -112,7 +111,6 @@ static int i2c_bus_reg(struct i2c_bit_adapter* b, struct matrox_fb_info* minfo, snprintf(b->adapter.name, sizeof(b->adapter.name), name, minfo->fbcon.node); i2c_set_adapdata(&b->adapter, b); - b->adapter.class = class; b->adapter.algo_data = &b->bac; b->adapter.dev.parent = &minfo->pcidev->dev; b->bac = matrox_i2c_algo_template; @@ -160,27 +158,24 @@ static void* i2c_matroxfb_probe(struct matrox_fb_info* minfo) { case MGA_2164: err = i2c_bus_reg(&m2info->ddc1, minfo, DDC1B_DATA, DDC1B_CLK, - "DDC:fb%u #0", I2C_CLASS_DDC); + "DDC:fb%u #0"); break; default: err = i2c_bus_reg(&m2info->ddc1, minfo, DDC1_DATA, DDC1_CLK, - "DDC:fb%u #0", I2C_CLASS_DDC); + "DDC:fb%u #0"); break; } if (err) goto fail_ddc1; if (minfo->devflags.dualhead) { - err = i2c_bus_reg(&m2info->ddc2, minfo, - DDC2_DATA, DDC2_CLK, - "DDC:fb%u #1", I2C_CLASS_DDC); + err = i2c_bus_reg(&m2info->ddc2, minfo, DDC2_DATA, DDC2_CLK, "DDC:fb%u #1"); if (err == -ENODEV) { printk(KERN_INFO "i2c-matroxfb: VGA->TV plug detected, DDC unavailable.\n"); } else if (err) printk(KERN_INFO "i2c-matroxfb: Could not register secondary output i2c bus. Continuing anyway.\n"); /* Register maven bus even on G450/G550 */ - err = i2c_bus_reg(&m2info->maven, minfo, - MAT_DATA, MAT_CLK, "MAVEN:fb%u", 0); + err = i2c_bus_reg(&m2info->maven, minfo, MAT_DATA, MAT_CLK, "MAVEN:fb%u"); if (err) printk(KERN_INFO "i2c-matroxfb: Could not register Maven i2c bus. Continuing anyway.\n"); else { diff --git a/drivers/video/fbdev/s3fb.c b/drivers/video/fbdev/s3fb.c index 589b349cb63e0..07722a5ea8eff 100644 --- a/drivers/video/fbdev/s3fb.c +++ b/drivers/video/fbdev/s3fb.c @@ -252,7 +252,6 @@ static int s3fb_setup_ddc_bus(struct fb_info *info) strscpy(par->ddc_adapter.name, info->fix.id, sizeof(par->ddc_adapter.name)); par->ddc_adapter.owner = THIS_MODULE; - par->ddc_adapter.class = I2C_CLASS_DDC; par->ddc_adapter.algo_data = &par->ddc_algo; par->ddc_adapter.dev.parent = info->device; par->ddc_algo.setsda = s3fb_ddc_setsda; diff --git a/drivers/video/fbdev/tdfxfb.c b/drivers/video/fbdev/tdfxfb.c index 22aa953138b0f..51ebe78359ec3 100644 --- a/drivers/video/fbdev/tdfxfb.c +++ b/drivers/video/fbdev/tdfxfb.c @@ -1267,7 +1267,6 @@ static int tdfxfb_setup_ddc_bus(struct tdfxfb_i2c_chan *chan, const char *name, strscpy(chan->adapter.name, name, sizeof(chan->adapter.name)); chan->adapter.owner = THIS_MODULE; - chan->adapter.class = I2C_CLASS_DDC; chan->adapter.algo_data = &chan->algo; chan->adapter.dev.parent = dev; chan->algo.setsda = tdfxfb_ddc_setsda; diff --git a/drivers/video/fbdev/tridentfb.c b/drivers/video/fbdev/tridentfb.c index 816d40b6f689c..516cf2a187575 100644 --- a/drivers/video/fbdev/tridentfb.c +++ b/drivers/video/fbdev/tridentfb.c @@ -274,7 +274,6 @@ static int tridentfb_setup_ddc_bus(struct fb_info *info) strscpy(par->ddc_adapter.name, info->fix.id, sizeof(par->ddc_adapter.name)); par->ddc_adapter.owner = THIS_MODULE; - par->ddc_adapter.class = I2C_CLASS_DDC; par->ddc_adapter.algo_data = &par->ddc_algo; par->ddc_adapter.dev.parent = info->device; if (is_oldclock(par->chip_id)) { /* not sure if this check is OK */ diff --git a/drivers/video/fbdev/via/via_i2c.c b/drivers/video/fbdev/via/via_i2c.c index c35e530e0ec9d..5825028105759 100644 --- a/drivers/video/fbdev/via/via_i2c.c +++ b/drivers/video/fbdev/via/via_i2c.c @@ -201,7 +201,6 @@ static int create_i2c_bus(struct i2c_adapter *adapter, sprintf(adapter->name, "viafb i2c io_port idx 0x%02x", adap_cfg->ioport_index); adapter->owner = THIS_MODULE; - adapter->class = I2C_CLASS_DDC; adapter->algo_data = algo; if (pdev) adapter->dev.parent = &pdev->dev; -- GitLab From b60db383e2ba64a18e49b6bef3be1ab18aa159f1 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 23 Nov 2023 10:40:40 +0100 Subject: [PATCH 1121/1290] include/linux/i2c.h: remove I2C_CLASS_DDC support After removal of the legacy EEPROM driver and I2C_CLASS_DDC support in olpc_dcon there's no i2c client driver left supporting I2C_CLASS_DDC. Class-based device auto-detection is a legacy mechanism and shouldn't be used in new code. So we can remove this class completely now. Signed-off-by: Heiner Kallweit Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 0dae9db275380..d029aade338fd 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -850,7 +850,6 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap) /* i2c adapter classes (bitmask) */ #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... */ -#define I2C_CLASS_DDC (1<<3) /* DDC bus on graphics adapters */ #define I2C_CLASS_SPD (1<<7) /* Memory modules */ /* Warn users that the adapter doesn't support classes anymore */ #define I2C_CLASS_DEPRECATED (1<<8) -- GitLab From f21682b362b67833e4f4f481c30abcb432861b0c Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 13 Nov 2023 12:37:15 +0100 Subject: [PATCH 1122/1290] drm/amd/pm: Remove I2C_CLASS_SPD support I2C_CLASS_SPD was used to expose the EEPROM content to user space, via the legacy eeprom driver. Now that this driver has been removed, we can remove I2C_CLASS_SPD support. at24 driver with explicit instantiation should be used instead. Signed-off-by: Heiner Kallweit Acked-by: Alex Deucher Signed-off-by: Wolfram Sang --- drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 1 - drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 1 - drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 1 - 3 files changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index 1a6675d70a4bc..bc56a29e63fad 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -1527,7 +1527,6 @@ static int aldebaran_i2c_control_init(struct smu_context *smu) smu_i2c->port = 0; mutex_init(&smu_i2c->mutex); control->owner = THIS_MODULE; - control->class = I2C_CLASS_SPD; control->dev.parent = &adev->pdev->dev; control->algo = &aldebaran_i2c_algo; snprintf(control->name, sizeof(control->name), "AMDGPU SMU 0"); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 82c4e1f1c6f07..c0e62bab97a12 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2720,7 +2720,6 @@ static int smu_v13_0_0_i2c_control_init(struct smu_context *smu) smu_i2c->port = i; mutex_init(&smu_i2c->mutex); control->owner = THIS_MODULE; - control->class = I2C_CLASS_SPD; control->dev.parent = &adev->pdev->dev; control->algo = &smu_v13_0_0_i2c_algo; snprintf(control->name, sizeof(control->name), "AMDGPU SMU %d", i); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 0e5a77c3c2e21..8bedd8d2ab2f4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1884,7 +1884,6 @@ static int smu_v13_0_6_i2c_control_init(struct smu_context *smu) smu_i2c->port = i; mutex_init(&smu_i2c->mutex); control->owner = THIS_MODULE; - control->class = I2C_CLASS_SPD; control->dev.parent = &adev->pdev->dev; control->algo = &smu_v13_0_6_i2c_algo; snprintf(control->name, sizeof(control->name), "AMDGPU SMU %d", i); -- GitLab From 9fd12f385720a85b4ce9a3cb98e66d470a5efd20 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 24 Nov 2023 11:16:10 +0100 Subject: [PATCH 1123/1290] i2c: Don't let i2c adapters declare I2C_CLASS_SPD support if they support I2C_CLASS_HWMON After removal of the legacy eeprom driver the only remaining I2C client device driver supporting I2C_CLASS_SPD is jc42. Because this driver also supports I2C_CLASS_HWMON, adapters don't have to declare support for I2C_CLASS_SPD if they support I2C_CLASS_HWMON. It's one step towards getting rid of I2C_CLASS_SPD mid-term. Signed-off-by: Heiner Kallweit Acked-by: Andi Shyti Acked-by: Jim Cromie # for SCX Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-ali1535.c | 2 +- drivers/i2c/busses/i2c-ali1563.c | 2 +- drivers/i2c/busses/i2c-ali15x3.c | 2 +- drivers/i2c/busses/i2c-amd756.c | 2 +- drivers/i2c/busses/i2c-amd8111.c | 2 +- drivers/i2c/busses/i2c-elektor.c | 2 +- drivers/i2c/busses/i2c-gpio.c | 2 +- drivers/i2c/busses/i2c-ibm_iic.c | 2 +- drivers/i2c/busses/i2c-iop3xx.c | 2 +- drivers/i2c/busses/i2c-isch.c | 2 +- drivers/i2c/busses/i2c-kempld.c | 3 +-- drivers/i2c/busses/i2c-mlxcpld.c | 2 +- drivers/i2c/busses/i2c-nforce2.c | 2 +- drivers/i2c/busses/i2c-pasemi-pci.c | 2 +- drivers/i2c/busses/i2c-piix4.c | 2 +- drivers/i2c/busses/i2c-scmi.c | 2 +- drivers/i2c/busses/i2c-sh7760.c | 2 +- drivers/i2c/busses/i2c-sibyte.c | 4 ++-- drivers/i2c/busses/i2c-sis5595.c | 2 +- drivers/i2c/busses/i2c-sis630.c | 2 +- drivers/i2c/busses/i2c-sis96x.c | 2 +- drivers/i2c/busses/i2c-via.c | 2 +- drivers/i2c/busses/i2c-viapro.c | 2 +- drivers/i2c/busses/scx200_acb.c | 2 +- 24 files changed, 25 insertions(+), 26 deletions(-) diff --git a/drivers/i2c/busses/i2c-ali1535.c b/drivers/i2c/busses/i2c-ali1535.c index ee83c4581bce0..461eb23f9d476 100644 --- a/drivers/i2c/busses/i2c-ali1535.c +++ b/drivers/i2c/busses/i2c-ali1535.c @@ -477,7 +477,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter ali1535_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-ali1563.c b/drivers/i2c/busses/i2c-ali1563.c index 55a9e93fbfeb5..307fb0666ecb2 100644 --- a/drivers/i2c/busses/i2c-ali1563.c +++ b/drivers/i2c/busses/i2c-ali1563.c @@ -390,7 +390,7 @@ static const struct i2c_algorithm ali1563_algorithm = { static struct i2c_adapter ali1563_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = &ali1563_algorithm, }; diff --git a/drivers/i2c/busses/i2c-ali15x3.c b/drivers/i2c/busses/i2c-ali15x3.c index 0231c5be6354f..d2fa30deb054c 100644 --- a/drivers/i2c/busses/i2c-ali15x3.c +++ b/drivers/i2c/busses/i2c-ali15x3.c @@ -461,7 +461,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter ali15x3_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-amd756.c b/drivers/i2c/busses/i2c-amd756.c index ef1307a258e95..208310db906df 100644 --- a/drivers/i2c/busses/i2c-amd756.c +++ b/drivers/i2c/busses/i2c-amd756.c @@ -285,7 +285,7 @@ static const struct i2c_algorithm smbus_algorithm = { struct i2c_adapter amd756_smbus = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-amd8111.c b/drivers/i2c/busses/i2c-amd8111.c index 1ed7e945bb6d1..42a9b1221065f 100644 --- a/drivers/i2c/busses/i2c-amd8111.c +++ b/drivers/i2c/busses/i2c-amd8111.c @@ -449,7 +449,7 @@ static int amd8111_probe(struct pci_dev *dev, const struct pci_device_id *id) smbus->adapter.owner = THIS_MODULE; snprintf(smbus->adapter.name, sizeof(smbus->adapter.name), "SMBus2 AMD8111 adapter at %04x", smbus->base); - smbus->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + smbus->adapter.class = I2C_CLASS_HWMON; smbus->adapter.algo = &smbus_algorithm; smbus->adapter.algo_data = smbus; diff --git a/drivers/i2c/busses/i2c-elektor.c b/drivers/i2c/busses/i2c-elektor.c index b0f50dce9d0fe..cfe8665cacd27 100644 --- a/drivers/i2c/busses/i2c-elektor.c +++ b/drivers/i2c/busses/i2c-elektor.c @@ -188,7 +188,7 @@ static struct i2c_algo_pcf_data pcf_isa_data = { static struct i2c_adapter pcf_isa_ops = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo_data = &pcf_isa_data, .name = "i2c-elektor", }; diff --git a/drivers/i2c/busses/i2c-gpio.c b/drivers/i2c/busses/i2c-gpio.c index fb35a75fe0e32..df2b183caa23a 100644 --- a/drivers/i2c/busses/i2c-gpio.c +++ b/drivers/i2c/busses/i2c-gpio.c @@ -444,7 +444,7 @@ static int i2c_gpio_probe(struct platform_device *pdev) snprintf(adap->name, sizeof(adap->name), "i2c-gpio%d", pdev->id); adap->algo_data = bit_data; - adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + adap->class = I2C_CLASS_HWMON; adap->dev.parent = dev; device_set_node(&adap->dev, fwnode); diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c index 408820319ec48..7fb87b78923e4 100644 --- a/drivers/i2c/busses/i2c-ibm_iic.c +++ b/drivers/i2c/busses/i2c-ibm_iic.c @@ -739,7 +739,7 @@ static int iic_probe(struct platform_device *ofdev) adap->dev.of_node = of_node_get(np); strscpy(adap->name, "IBM IIC", sizeof(adap->name)); i2c_set_adapdata(adap, dev); - adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + adap->class = I2C_CLASS_HWMON; adap->algo = &iic_algo; adap->timeout = HZ; diff --git a/drivers/i2c/busses/i2c-iop3xx.c b/drivers/i2c/busses/i2c-iop3xx.c index f2f7ebeeaecb0..2e5f0165c3d30 100644 --- a/drivers/i2c/busses/i2c-iop3xx.c +++ b/drivers/i2c/busses/i2c-iop3xx.c @@ -478,7 +478,7 @@ iop3xx_i2c_probe(struct platform_device *pdev) memcpy(new_adapter->name, pdev->name, strlen(pdev->name)); new_adapter->owner = THIS_MODULE; - new_adapter->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + new_adapter->class = I2C_CLASS_HWMON; new_adapter->dev.parent = &pdev->dev; new_adapter->dev.of_node = pdev->dev.of_node; new_adapter->nr = pdev->id; diff --git a/drivers/i2c/busses/i2c-isch.c b/drivers/i2c/busses/i2c-isch.c index 1dc1ceaa44439..416a9968ed287 100644 --- a/drivers/i2c/busses/i2c-isch.c +++ b/drivers/i2c/busses/i2c-isch.c @@ -249,7 +249,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sch_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-kempld.c b/drivers/i2c/busses/i2c-kempld.c index e01d753082884..c3a529a73b5bc 100644 --- a/drivers/i2c/busses/i2c-kempld.c +++ b/drivers/i2c/busses/i2c-kempld.c @@ -283,8 +283,7 @@ static const struct i2c_algorithm kempld_i2c_algorithm = { static const struct i2c_adapter kempld_i2c_adapter = { .owner = THIS_MODULE, .name = "i2c-kempld", - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD | - I2C_CLASS_DEPRECATED, + .class = I2C_CLASS_HWMON | I2C_CLASS_DEPRECATED, .algo = &kempld_i2c_algorithm, }; diff --git a/drivers/i2c/busses/i2c-mlxcpld.c b/drivers/i2c/busses/i2c-mlxcpld.c index 6fec64ea67fbc..099291a0411de 100644 --- a/drivers/i2c/busses/i2c-mlxcpld.c +++ b/drivers/i2c/busses/i2c-mlxcpld.c @@ -477,7 +477,7 @@ static const struct i2c_adapter_quirks mlxcpld_i2c_quirks_ext2 = { static struct i2c_adapter mlxcpld_i2c_adapter = { .owner = THIS_MODULE, .name = "i2c-mlxcpld", - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = &mlxcpld_i2c_algo, .quirks = &mlxcpld_i2c_quirks, .retries = MLXCPLD_I2C_RETR_NUM, diff --git a/drivers/i2c/busses/i2c-nforce2.c b/drivers/i2c/busses/i2c-nforce2.c index 38d203d93eeec..fab662e6bc084 100644 --- a/drivers/i2c/busses/i2c-nforce2.c +++ b/drivers/i2c/busses/i2c-nforce2.c @@ -349,7 +349,7 @@ static int nforce2_probe_smb(struct pci_dev *dev, int bar, int alt_reg, return -EBUSY; } smbus->adapter.owner = THIS_MODULE; - smbus->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + smbus->adapter.class = I2C_CLASS_HWMON; smbus->adapter.algo = &smbus_algorithm; smbus->adapter.algo_data = smbus; smbus->adapter.dev.parent = &dev->dev; diff --git a/drivers/i2c/busses/i2c-pasemi-pci.c b/drivers/i2c/busses/i2c-pasemi-pci.c index cfc89e04eb94c..77f90c7436eda 100644 --- a/drivers/i2c/busses/i2c-pasemi-pci.c +++ b/drivers/i2c/busses/i2c-pasemi-pci.c @@ -56,7 +56,7 @@ static int pasemi_smb_pci_probe(struct pci_dev *dev, if (!smbus->ioaddr) return -EBUSY; - smbus->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + smbus->adapter.class = I2C_CLASS_HWMON; error = pasemi_i2c_common_probe(smbus); if (error) return error; diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index 809fbd014cd68..6a0392172b2f2 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -943,7 +943,7 @@ static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba, } adap->owner = THIS_MODULE; - adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + adap->class = I2C_CLASS_HWMON; adap->algo = sb800_main ? &piix4_smbus_algorithm_sb800 : &smbus_algorithm; diff --git a/drivers/i2c/busses/i2c-scmi.c b/drivers/i2c/busses/i2c-scmi.c index 421735acfa141..d7af8e0d7599e 100644 --- a/drivers/i2c/busses/i2c-scmi.c +++ b/drivers/i2c/busses/i2c-scmi.c @@ -385,7 +385,7 @@ static int smbus_cmi_probe(struct platform_device *device) smbus_cmi->adapter.owner = THIS_MODULE; smbus_cmi->adapter.algo = &acpi_smbus_cmi_algorithm; smbus_cmi->adapter.algo_data = smbus_cmi; - smbus_cmi->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + smbus_cmi->adapter.class = I2C_CLASS_HWMON; smbus_cmi->adapter.dev.parent = &device->dev; ret = i2c_add_adapter(&smbus_cmi->adapter); diff --git a/drivers/i2c/busses/i2c-sh7760.c b/drivers/i2c/busses/i2c-sh7760.c index 1ad2a26156d17..8a043f5fca1e0 100644 --- a/drivers/i2c/busses/i2c-sh7760.c +++ b/drivers/i2c/busses/i2c-sh7760.c @@ -477,7 +477,7 @@ static int sh7760_i2c_probe(struct platform_device *pdev) id->adap.nr = pdev->id; id->adap.algo = &sh7760_i2c_algo; - id->adap.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + id->adap.class = I2C_CLASS_HWMON; id->adap.retries = 3; id->adap.algo_data = id; id->adap.dev.parent = &pdev->dev; diff --git a/drivers/i2c/busses/i2c-sibyte.c b/drivers/i2c/busses/i2c-sibyte.c index 8f71f01cb169b..49f8f4f1b0f0f 100644 --- a/drivers/i2c/busses/i2c-sibyte.c +++ b/drivers/i2c/busses/i2c-sibyte.c @@ -142,7 +142,7 @@ static struct i2c_algo_sibyte_data sibyte_board_data[2] = { static struct i2c_adapter sibyte_board_adapter[2] = { { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = NULL, .algo_data = &sibyte_board_data[0], .nr = 0, @@ -150,7 +150,7 @@ static struct i2c_adapter sibyte_board_adapter[2] = { }, { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = NULL, .algo_data = &sibyte_board_data[1], .nr = 1, diff --git a/drivers/i2c/busses/i2c-sis5595.c b/drivers/i2c/busses/i2c-sis5595.c index 486f1e9dfb74a..32476dc10ad61 100644 --- a/drivers/i2c/busses/i2c-sis5595.c +++ b/drivers/i2c/busses/i2c-sis5595.c @@ -353,7 +353,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sis5595_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-sis630.c b/drivers/i2c/busses/i2c-sis630.c index 87d56250d78a3..3505cf29cedda 100644 --- a/drivers/i2c/busses/i2c-sis630.c +++ b/drivers/i2c/busses/i2c-sis630.c @@ -493,7 +493,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sis630_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = &smbus_algorithm, .retries = 3 }; diff --git a/drivers/i2c/busses/i2c-sis96x.c b/drivers/i2c/busses/i2c-sis96x.c index cde8003985a58..77529dda6fcde 100644 --- a/drivers/i2c/busses/i2c-sis96x.c +++ b/drivers/i2c/busses/i2c-sis96x.c @@ -228,7 +228,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sis96x_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-via.c b/drivers/i2c/busses/i2c-via.c index ad4f09c7f0275..7ed29992a97ff 100644 --- a/drivers/i2c/busses/i2c-via.c +++ b/drivers/i2c/busses/i2c-via.c @@ -70,7 +70,7 @@ static struct i2c_algo_bit_data bit_data = { static struct i2c_adapter vt586b_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .name = "VIA i2c", .algo_data = &bit_data, }; diff --git a/drivers/i2c/busses/i2c-viapro.c b/drivers/i2c/busses/i2c-viapro.c index 970ccdcbb8896..2cc7bba3b8bf8 100644 --- a/drivers/i2c/busses/i2c-viapro.c +++ b/drivers/i2c/busses/i2c-viapro.c @@ -304,7 +304,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter vt596_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/scx200_acb.c b/drivers/i2c/busses/scx200_acb.c index 83c1db610f54b..3648382b885a4 100644 --- a/drivers/i2c/busses/scx200_acb.c +++ b/drivers/i2c/busses/scx200_acb.c @@ -427,7 +427,7 @@ static struct scx200_acb_iface *scx200_create_iface(const char *text, snprintf(adapter->name, sizeof(adapter->name), "%s ACB%d", text, index); adapter->owner = THIS_MODULE; adapter->algo = &scx200_acb_algorithm; - adapter->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + adapter->class = I2C_CLASS_HWMON; adapter->dev.parent = dev; mutex_init(&iface->mutex); -- GitLab From c1cc7ccb0ff7596a7025a844d29487df0efd40b2 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 24 Nov 2023 11:16:14 +0100 Subject: [PATCH 1124/1290] i2c: stub: Don't let i2c adapters declare I2C_CLASS_SPD support if they support I2C_CLASS_HWMON After removal of the legacy eeprom driver the only remaining I2C client device driver supporting I2C_CLASS_SPD is jc42. Because this driver also supports I2C_CLASS_HWMON, adapters don't have to declare support for I2C_CLASS_SPD if they support I2C_CLASS_HWMON. It's one step towards getting rid of I2C_CLASS_SPD mid-term. Signed-off-by: Heiner Kallweit Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-stub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/i2c-stub.c b/drivers/i2c/i2c-stub.c index d642cad219d9e..09e7b7bf4c5f7 100644 --- a/drivers/i2c/i2c-stub.c +++ b/drivers/i2c/i2c-stub.c @@ -308,7 +308,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter stub_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = &smbus_algorithm, .name = "SMBus stub driver", }; -- GitLab From 8cd210d200ad00b29a8e0476ceef585bdca1d2a7 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 24 Nov 2023 11:16:17 +0100 Subject: [PATCH 1125/1290] media: netup_unidvb: Don't let i2c adapters declare I2C_CLASS_SPD support if they support I2C_CLASS_HWMON After removal of the legacy eeprom driver the only remaining I2C client device driver supporting I2C_CLASS_SPD is jc42. Because this driver also supports I2C_CLASS_HWMON, adapters don't have to declare support for I2C_CLASS_SPD if they support I2C_CLASS_HWMON. It's one step towards getting rid of I2C_CLASS_SPD mid-term. Signed-off-by: Heiner Kallweit Acked-by: Hans Verkuil Signed-off-by: Wolfram Sang --- drivers/media/pci/netup_unidvb/netup_unidvb_i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/pci/netup_unidvb/netup_unidvb_i2c.c b/drivers/media/pci/netup_unidvb/netup_unidvb_i2c.c index bd38ce4442325..46676f2c89c72 100644 --- a/drivers/media/pci/netup_unidvb/netup_unidvb_i2c.c +++ b/drivers/media/pci/netup_unidvb/netup_unidvb_i2c.c @@ -289,7 +289,7 @@ static const struct i2c_algorithm netup_i2c_algorithm = { static const struct i2c_adapter netup_i2c_adapter = { .owner = THIS_MODULE, .name = NETUP_UNIDVB_NAME, - .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, + .class = I2C_CLASS_HWMON, .algo = &netup_i2c_algorithm, }; -- GitLab From f79ad78a25841fdde6cd589969966fb113cf1d10 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 24 Nov 2023 11:16:11 +0100 Subject: [PATCH 1126/1290] staging: greybus: Don't let i2c adapters declare I2C_CLASS_SPD support if they support I2C_CLASS_HWMON After removal of the legacy eeprom driver the only remaining I2C client device driver supporting I2C_CLASS_SPD is jc42. Because this driver also supports I2C_CLASS_HWMON, adapters don't have to declare support for I2C_CLASS_SPD if they support I2C_CLASS_HWMON. It's one step towards getting rid of I2C_CLASS_SPD mid-term. Acked-by: Greg Kroah-Hartman Signed-off-by: Heiner Kallweit Signed-off-by: Wolfram Sang --- drivers/staging/greybus/i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/greybus/i2c.c b/drivers/staging/greybus/i2c.c index de2f6516da095..22325ab9d6521 100644 --- a/drivers/staging/greybus/i2c.c +++ b/drivers/staging/greybus/i2c.c @@ -264,7 +264,7 @@ static int gb_i2c_probe(struct gbphy_device *gbphy_dev, /* Looks good; up our i2c adapter */ adapter = &gb_i2c_dev->adapter; adapter->owner = THIS_MODULE; - adapter->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; + adapter->class = I2C_CLASS_HWMON; adapter->algo = &gb_i2c_algorithm; adapter->dev.parent = &gbphy_dev->dev; -- GitLab From 73febd775bdbdb98c81255ff85773ac410ded5c4 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 12 Nov 2023 17:54:41 -0500 Subject: [PATCH 1127/1290] i2c: create debugfs entry per adapter Two drivers already implement custom debugfs handling for their i2c_adapter and more will come. So, let the core create a debugfs directory per adapter and pass that to drivers for their debugfs files. Signed-off-by: Wolfram Sang Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 11 +++++++++++ include/linux/i2c.h | 2 ++ 2 files changed, 13 insertions(+) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index eac90a3cf61a4..f6c828bbd166a 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +68,8 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver); static DEFINE_STATIC_KEY_FALSE(i2c_trace_msg_key); static bool is_registered; +static struct dentry *i2c_debugfs_root; + int i2c_transfer_trace_reg(void) { static_branch_inc(&i2c_trace_msg_key); @@ -1524,6 +1527,8 @@ static int i2c_register_adapter(struct i2c_adapter *adap) goto out_list; } + adap->debugfs = debugfs_create_dir(dev_name(&adap->dev), i2c_debugfs_root); + res = i2c_setup_smbus_alert(adap); if (res) goto out_reg; @@ -1563,6 +1568,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap) return 0; out_reg: + debugfs_remove_recursive(adap->debugfs); init_completion(&adap->dev_released); device_unregister(&adap->dev); wait_for_completion(&adap->dev_released); @@ -1764,6 +1770,8 @@ void i2c_del_adapter(struct i2c_adapter *adap) i2c_host_notify_irq_teardown(adap); + debugfs_remove_recursive(adap->debugfs); + /* wait until all references to the device are gone * * FIXME: This is old code and should ideally be replaced by an @@ -2061,6 +2069,8 @@ static int __init i2c_init(void) is_registered = true; + i2c_debugfs_root = debugfs_create_dir("i2c", NULL); + #ifdef CONFIG_I2C_COMPAT i2c_adapter_compat_class = class_compat_register("i2c-adapter"); if (!i2c_adapter_compat_class) { @@ -2099,6 +2109,7 @@ static void __exit i2c_exit(void) #ifdef CONFIG_I2C_COMPAT class_compat_unregister(i2c_adapter_compat_class); #endif + debugfs_remove_recursive(i2c_debugfs_root); bus_unregister(&i2c_bus_type); tracepoint_synchronize_unregister(); } diff --git a/include/linux/i2c.h b/include/linux/i2c.h index d029aade338fd..e01fb1097868c 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -746,6 +746,8 @@ struct i2c_adapter { struct irq_domain *host_notify_domain; struct regulator *bus_regulator; + + struct dentry *debugfs; }; #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) -- GitLab From c66520c02c2f5b75e4f06a83ece4382fe2f92cfc Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 12 Nov 2023 17:59:10 -0500 Subject: [PATCH 1128/1290] i2c: gpio: move to per-adapter debugfs directory The I2C core now provides a per-adapter debugfs directory. Use it instead of creating a custom one. Signed-off-by: Wolfram Sang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-gpio.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/drivers/i2c/busses/i2c-gpio.c b/drivers/i2c/busses/i2c-gpio.c index df2b183caa23a..4f1411b1a7754 100644 --- a/drivers/i2c/busses/i2c-gpio.c +++ b/drivers/i2c/busses/i2c-gpio.c @@ -25,7 +25,6 @@ struct i2c_gpio_private_data { struct i2c_algo_bit_data bit_data; struct i2c_gpio_platform_data pdata; #ifdef CONFIG_I2C_GPIO_FAULT_INJECTOR - struct dentry *debug_dir; /* these must be protected by bus lock */ struct completion scl_irq_completion; u64 scl_irq_data; @@ -72,7 +71,6 @@ static int i2c_gpio_getscl(void *data) } #ifdef CONFIG_I2C_GPIO_FAULT_INJECTOR -static struct dentry *i2c_gpio_debug_dir; #define setsda(bd, val) ((bd)->setsda((bd)->data, val)) #define setscl(bd, val) ((bd)->setscl((bd)->data, val)) @@ -258,41 +256,23 @@ static void i2c_gpio_fault_injector_init(struct platform_device *pdev) { struct i2c_gpio_private_data *priv = platform_get_drvdata(pdev); - /* - * If there will be a debugfs-dir per i2c adapter somewhen, put the - * 'fault-injector' dir there. Until then, we have a global dir with - * all adapters as subdirs. - */ - if (!i2c_gpio_debug_dir) - i2c_gpio_debug_dir = debugfs_create_dir("i2c-fault-injector", NULL); - - priv->debug_dir = debugfs_create_dir(pdev->name, i2c_gpio_debug_dir); - init_completion(&priv->scl_irq_completion); - debugfs_create_file_unsafe("incomplete_address_phase", 0200, priv->debug_dir, + debugfs_create_file_unsafe("incomplete_address_phase", 0200, priv->adap.debugfs, priv, &fops_incomplete_addr_phase); - debugfs_create_file_unsafe("incomplete_write_byte", 0200, priv->debug_dir, + debugfs_create_file_unsafe("incomplete_write_byte", 0200, priv->adap.debugfs, priv, &fops_incomplete_write_byte); if (priv->bit_data.getscl) { - debugfs_create_file_unsafe("inject_panic", 0200, priv->debug_dir, + debugfs_create_file_unsafe("inject_panic", 0200, priv->adap.debugfs, priv, &fops_inject_panic); - debugfs_create_file_unsafe("lose_arbitration", 0200, priv->debug_dir, + debugfs_create_file_unsafe("lose_arbitration", 0200, priv->adap.debugfs, priv, &fops_lose_arbitration); } - debugfs_create_file_unsafe("scl", 0600, priv->debug_dir, priv, &fops_scl); - debugfs_create_file_unsafe("sda", 0600, priv->debug_dir, priv, &fops_sda); -} - -static void i2c_gpio_fault_injector_exit(struct platform_device *pdev) -{ - struct i2c_gpio_private_data *priv = platform_get_drvdata(pdev); - - debugfs_remove_recursive(priv->debug_dir); + debugfs_create_file_unsafe("scl", 0600, priv->adap.debugfs, priv, &fops_scl); + debugfs_create_file_unsafe("sda", 0600, priv->adap.debugfs, priv, &fops_sda); } #else static inline void i2c_gpio_fault_injector_init(struct platform_device *pdev) {} -static inline void i2c_gpio_fault_injector_exit(struct platform_device *pdev) {} #endif /* CONFIG_I2C_GPIO_FAULT_INJECTOR*/ /* Get i2c-gpio properties from DT or ACPI table */ @@ -475,8 +455,6 @@ static void i2c_gpio_remove(struct platform_device *pdev) struct i2c_gpio_private_data *priv; struct i2c_adapter *adap; - i2c_gpio_fault_injector_exit(pdev); - priv = platform_get_drvdata(pdev); adap = &priv->adap; -- GitLab From e19e1abc9ce45f4d4cad2f7eab1827e683be4e84 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 12 Nov 2023 17:59:11 -0500 Subject: [PATCH 1129/1290] i2c: npcm7xx: move to per-adapter debugfs directory The I2C core now provides a per-adapter debugfs directory. Use it instead of creating a custom one. Signed-off-by: Wolfram Sang Reviewed-by: Tali Perry Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-npcm7xx.c | 49 +++++--------------------------- 1 file changed, 7 insertions(+), 42 deletions(-) diff --git a/drivers/i2c/busses/i2c-npcm7xx.c b/drivers/i2c/busses/i2c-npcm7xx.c index ae4bae63ad4f3..54181b3f19196 100644 --- a/drivers/i2c/busses/i2c-npcm7xx.c +++ b/drivers/i2c/busses/i2c-npcm7xx.c @@ -326,7 +326,6 @@ struct npcm_i2c { u8 slv_rd_buf[MAX_I2C_HW_FIFO_SIZE]; u8 slv_wr_buf[MAX_I2C_HW_FIFO_SIZE]; #endif - struct dentry *debugfs; /* debugfs device directory */ u64 ber_cnt; u64 rec_succ_cnt; u64 rec_fail_cnt; @@ -2250,27 +2249,15 @@ static const struct i2c_algorithm npcm_i2c_algo = { #endif }; -/* i2c debugfs directory: used to keep health monitor of i2c devices */ -static struct dentry *npcm_i2c_debugfs_dir; - static void npcm_i2c_init_debugfs(struct platform_device *pdev, struct npcm_i2c *bus) { - struct dentry *d; - - if (!npcm_i2c_debugfs_dir) - return; - d = debugfs_create_dir(dev_name(&pdev->dev), npcm_i2c_debugfs_dir); - if (IS_ERR_OR_NULL(d)) - return; - debugfs_create_u64("ber_cnt", 0444, d, &bus->ber_cnt); - debugfs_create_u64("nack_cnt", 0444, d, &bus->nack_cnt); - debugfs_create_u64("rec_succ_cnt", 0444, d, &bus->rec_succ_cnt); - debugfs_create_u64("rec_fail_cnt", 0444, d, &bus->rec_fail_cnt); - debugfs_create_u64("timeout_cnt", 0444, d, &bus->timeout_cnt); - debugfs_create_u64("tx_complete_cnt", 0444, d, &bus->tx_complete_cnt); - - bus->debugfs = d; + debugfs_create_u64("ber_cnt", 0444, bus->adap.debugfs, &bus->ber_cnt); + debugfs_create_u64("nack_cnt", 0444, bus->adap.debugfs, &bus->nack_cnt); + debugfs_create_u64("rec_succ_cnt", 0444, bus->adap.debugfs, &bus->rec_succ_cnt); + debugfs_create_u64("rec_fail_cnt", 0444, bus->adap.debugfs, &bus->rec_fail_cnt); + debugfs_create_u64("timeout_cnt", 0444, bus->adap.debugfs, &bus->timeout_cnt); + debugfs_create_u64("tx_complete_cnt", 0444, bus->adap.debugfs, &bus->tx_complete_cnt); } static int npcm_i2c_probe_bus(struct platform_device *pdev) @@ -2362,7 +2349,6 @@ static void npcm_i2c_remove_bus(struct platform_device *pdev) unsigned long lock_flags; struct npcm_i2c *bus = platform_get_drvdata(pdev); - debugfs_remove_recursive(bus->debugfs); spin_lock_irqsave(&bus->lock, lock_flags); npcm_i2c_disable(bus); spin_unlock_irqrestore(&bus->lock, lock_flags); @@ -2385,28 +2371,7 @@ static struct platform_driver npcm_i2c_bus_driver = { } }; -static int __init npcm_i2c_init(void) -{ - int ret; - - npcm_i2c_debugfs_dir = debugfs_create_dir("npcm_i2c", NULL); - - ret = platform_driver_register(&npcm_i2c_bus_driver); - if (ret) { - debugfs_remove_recursive(npcm_i2c_debugfs_dir); - return ret; - } - - return 0; -} -module_init(npcm_i2c_init); - -static void __exit npcm_i2c_exit(void) -{ - platform_driver_unregister(&npcm_i2c_bus_driver); - debugfs_remove_recursive(npcm_i2c_debugfs_dir); -} -module_exit(npcm_i2c_exit); +module_platform_driver(npcm_i2c_bus_driver); MODULE_AUTHOR("Avi Fishman "); MODULE_AUTHOR("Tali Perry "); -- GitLab From 2b523c46e81ebd621515ab47117f95de197dfcbf Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 14 Dec 2023 08:43:57 +0100 Subject: [PATCH 1130/1290] i2c: rcar: introduce Gen4 devices So far, we treated Gen4 as Gen3. But we are soon adding FM+ as a Gen4 specific feature, so prepare the code for the new devtype. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-rcar.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/busses/i2c-rcar.c b/drivers/i2c/busses/i2c-rcar.c index 829ac053bbb7c..973811a6c27ba 100644 --- a/drivers/i2c/busses/i2c-rcar.c +++ b/drivers/i2c/busses/i2c-rcar.c @@ -132,6 +132,7 @@ enum rcar_i2c_type { I2C_RCAR_GEN1, I2C_RCAR_GEN2, I2C_RCAR_GEN3, + I2C_RCAR_GEN4, }; struct rcar_i2c_priv { @@ -431,8 +432,8 @@ static void rcar_i2c_cleanup_dma(struct rcar_i2c_priv *priv, bool terminate) dma_unmap_single(chan->device->dev, sg_dma_address(&priv->sg), sg_dma_len(&priv->sg), priv->dma_direction); - /* Gen3 can only do one RXDMA per transfer and we just completed it */ - if (priv->devtype == I2C_RCAR_GEN3 && + /* Gen3+ can only do one RXDMA per transfer and we just completed it */ + if (priv->devtype >= I2C_RCAR_GEN3 && priv->dma_direction == DMA_FROM_DEVICE) priv->flags |= ID_P_NO_RXDMA; @@ -886,8 +887,8 @@ static int rcar_i2c_master_xfer(struct i2c_adapter *adap, if (ret < 0) goto out; - /* Gen3 needs a reset before allowing RXDMA once */ - if (priv->devtype == I2C_RCAR_GEN3) { + /* Gen3+ needs a reset. That also allows RXDMA once */ + if (priv->devtype >= I2C_RCAR_GEN3) { priv->flags &= ~ID_P_NO_RXDMA; ret = rcar_i2c_do_reset(priv); if (ret) @@ -1075,7 +1076,7 @@ static const struct of_device_id rcar_i2c_dt_ids[] = { { .compatible = "renesas,rcar-gen1-i2c", .data = (void *)I2C_RCAR_GEN1 }, { .compatible = "renesas,rcar-gen2-i2c", .data = (void *)I2C_RCAR_GEN2 }, { .compatible = "renesas,rcar-gen3-i2c", .data = (void *)I2C_RCAR_GEN3 }, - { .compatible = "renesas,rcar-gen4-i2c", .data = (void *)I2C_RCAR_GEN3 }, + { .compatible = "renesas,rcar-gen4-i2c", .data = (void *)I2C_RCAR_GEN4 }, {}, }; MODULE_DEVICE_TABLE(of, rcar_i2c_dt_ids); @@ -1151,7 +1152,7 @@ static int rcar_i2c_probe(struct platform_device *pdev) if (of_property_read_bool(dev->of_node, "smbus")) priv->flags |= ID_P_HOST_NOTIFY; - if (priv->devtype == I2C_RCAR_GEN3) { + if (priv->devtype >= I2C_RCAR_GEN3) { priv->rstc = devm_reset_control_get_exclusive(&pdev->dev, NULL); if (IS_ERR(priv->rstc)) { ret = PTR_ERR(priv->rstc); -- GitLab From d0520eb3ed54df89eb5961ec3b88634f313123f2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 14 Dec 2023 08:43:58 +0100 Subject: [PATCH 1131/1290] i2c: rcar: add FastMode+ support for Gen4 To support FM+, we mainly need to turn the SMD constant into a parameter and set it accordingly. That also means we can finally fix SMD to our needs instead of bailing out. A sanity check for SMD then becomes a sanity check for 'x == 0'. After all that, activating the enable bit for FM+ is all we need to do. Tested with a Renesas Falcon board using R-Car V3U. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-rcar.c | 38 +++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/drivers/i2c/busses/i2c-rcar.c b/drivers/i2c/busses/i2c-rcar.c index 973811a6c27ba..828aa2ea0fe4c 100644 --- a/drivers/i2c/busses/i2c-rcar.c +++ b/drivers/i2c/busses/i2c-rcar.c @@ -89,6 +89,7 @@ #define TMDMAE BIT(0) /* DMA Master Transmitted Enable */ /* ICCCR2 */ +#define FMPE BIT(7) /* Fast Mode Plus Enable */ #define CDFD BIT(2) /* CDF Disable */ #define HLSE BIT(1) /* HIGH/LOW Separate Control Enable */ #define SME BIT(0) /* SCL Mask Enable */ @@ -122,11 +123,12 @@ #define ID_NACK BIT(4) #define ID_EPROTO BIT(5) /* persistent flags */ +#define ID_P_FMPLUS BIT(27) #define ID_P_NOT_ATOMIC BIT(28) #define ID_P_HOST_NOTIFY BIT(29) #define ID_P_NO_RXDMA BIT(30) /* HW forbids RXDMA sometimes */ #define ID_P_PM_BLOCKED BIT(31) -#define ID_P_MASK GENMASK(31, 28) +#define ID_P_MASK GENMASK(31, 27) enum rcar_i2c_type { I2C_RCAR_GEN1, @@ -149,6 +151,7 @@ struct rcar_i2c_priv { u32 icccr; u16 schd; u16 scld; + u8 smd; u8 recovery_icmcr; /* protected by adapter lock */ enum rcar_i2c_type devtype; struct i2c_client *slave; @@ -240,9 +243,14 @@ static void rcar_i2c_init(struct rcar_i2c_priv *priv) if (priv->devtype < I2C_RCAR_GEN3) { rcar_i2c_write(priv, ICCCR, priv->icccr); } else { - rcar_i2c_write(priv, ICCCR2, CDFD | HLSE | SME); + u32 icccr2 = CDFD | HLSE | SME; + + if (priv->flags & ID_P_FMPLUS) + icccr2 |= FMPE; + + rcar_i2c_write(priv, ICCCR2, icccr2); rcar_i2c_write(priv, ICCCR, priv->icccr); - rcar_i2c_write(priv, ICMPR, RCAR_DEFAULT_SMD); + rcar_i2c_write(priv, ICMPR, priv->smd); rcar_i2c_write(priv, ICHPR, priv->schd); rcar_i2c_write(priv, ICLPR, priv->scld); rcar_i2c_write(priv, ICFBSCR, TCYC17); @@ -279,6 +287,7 @@ static int rcar_i2c_clock_calculate(struct rcar_i2c_priv *priv) /* Fall back to previously used values if not supplied */ i2c_parse_fw_timings(dev, &t, false); + priv->smd = RCAR_DEFAULT_SMD; /* * calculate SCL clock @@ -304,6 +313,11 @@ static int rcar_i2c_clock_calculate(struct rcar_i2c_priv *priv) if (cdf >= 1U << cdf_width) goto err_no_val; + if (t.bus_freq_hz > I2C_MAX_FAST_MODE_FREQ && priv->devtype >= I2C_RCAR_GEN4) + priv->flags |= ID_P_FMPLUS; + else + priv->flags &= ~ID_P_FMPLUS; + /* On Gen3+, we use cdf only for the filters, not as a SCL divider */ ick = rate / (priv->devtype < I2C_RCAR_GEN3 ? (cdf + 1) : 1); @@ -345,30 +359,30 @@ static int rcar_i2c_clock_calculate(struct rcar_i2c_priv *priv) * x as a base value for the SCLD/SCHD ratio: * * SCL = clkp / (8 + 2 * SMD + SCLD + SCHD + F[(ticf + tr + intd) * clkp]) - * SCL = clkp / (8 + 2 * RCAR_DEFAULT_SMD + RCAR_SCLD_RATIO * x + * SCL = clkp / (8 + 2 * SMD + RCAR_SCLD_RATIO * x * + RCAR_SCHD_RATIO * x + F[...]) * * with: sum_ratio = RCAR_SCLD_RATIO + RCAR_SCHD_RATIO - * and: smd = RCAR_DEFAULT_SMD * * SCL = clkp / (8 + 2 * smd + sum_ratio * x + F[...]) * 8 + 2 * smd + sum_ratio * x + F[...] = clkp / SCL * x = ((clkp / SCL) - 8 - 2 * smd - F[...]) / sum_ratio */ x = DIV_ROUND_UP(rate, t.bus_freq_hz ?: 1); - x = DIV_ROUND_UP(x - 8 - 2 * RCAR_DEFAULT_SMD - round, sum_ratio); - scl = rate / (8 + 2 * RCAR_DEFAULT_SMD + sum_ratio * x + round); + x = DIV_ROUND_UP(x - 8 - 2 * priv->smd - round, sum_ratio); + scl = rate / (8 + 2 * priv->smd + sum_ratio * x + round); - /* Bail out if values don't fit into 16 bit or SMD became too large */ - if (x * RCAR_SCLD_RATIO > 0xffff || RCAR_DEFAULT_SMD > x * RCAR_SCHD_RATIO) + if (x == 0 || x * RCAR_SCLD_RATIO > 0xffff) goto err_no_val; priv->icccr = cdf; priv->schd = RCAR_SCHD_RATIO * x; priv->scld = RCAR_SCLD_RATIO * x; + if (priv->smd >= priv->schd) + priv->smd = priv->schd - 1; - dev_dbg(dev, "clk %u/%u(%lu), round %u, CDF: %u SCHD %u SCLD %u\n", - scl, t.bus_freq_hz, rate, round, cdf, priv->schd, priv->scld); + dev_dbg(dev, "clk %u/%u(%lu), round %u, CDF: %u SCHD %u SCLD %u SMD %u\n", + scl, t.bus_freq_hz, rate, round, cdf, priv->schd, priv->scld, priv->smd); } return 0; @@ -1073,6 +1087,8 @@ static const struct of_device_id rcar_i2c_dt_ids[] = { { .compatible = "renesas,i2c-r8a7794", .data = (void *)I2C_RCAR_GEN2 }, { .compatible = "renesas,i2c-r8a7795", .data = (void *)I2C_RCAR_GEN3 }, { .compatible = "renesas,i2c-r8a7796", .data = (void *)I2C_RCAR_GEN3 }, + /* S4 has no FM+ bit */ + { .compatible = "renesas,i2c-r8a779f0", .data = (void *)I2C_RCAR_GEN3 }, { .compatible = "renesas,rcar-gen1-i2c", .data = (void *)I2C_RCAR_GEN1 }, { .compatible = "renesas,rcar-gen2-i2c", .data = (void *)I2C_RCAR_GEN2 }, { .compatible = "renesas,rcar-gen3-i2c", .data = (void *)I2C_RCAR_GEN3 }, -- GitLab From 0d9cf23ed55d7ba3ab26d617a3ae507863674c8f Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 8 Nov 2023 17:43:52 +0100 Subject: [PATCH 1132/1290] i2c: s3c24xx: fix read transfers in polling mode To properly handle read transfers in polling mode, no waiting for the ACK state is needed as it will never come. Just wait a bit to ensure start state is on the bus and continue processing next bytes. Fixes: 117053f77a5a ("i2c: s3c2410: Add polling mode support") Signed-off-by: Marek Szyprowski Reviewed-by: Chanho Park Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-s3c2410.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c index c56886af724ea..bf9a5670ef337 100644 --- a/drivers/i2c/busses/i2c-s3c2410.c +++ b/drivers/i2c/busses/i2c-s3c2410.c @@ -216,8 +216,17 @@ static bool is_ack(struct s3c24xx_i2c *i2c) int tries; for (tries = 50; tries; --tries) { - if (readl(i2c->regs + S3C2410_IICCON) - & S3C2410_IICCON_IRQPEND) { + unsigned long tmp = readl(i2c->regs + S3C2410_IICCON); + + if (!(tmp & S3C2410_IICCON_ACKEN)) { + /* + * Wait a bit for the bus to stabilize, + * delay estimated experimentally. + */ + usleep_range(100, 200); + return true; + } + if (tmp & S3C2410_IICCON_IRQPEND) { if (!(readl(i2c->regs + S3C2410_IICSTAT) & S3C2410_IICSTAT_LASTBIT)) return true; -- GitLab From 990489e1042c6c5d6bccf56deca68f8dbeed8180 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 8 Nov 2023 17:43:53 +0100 Subject: [PATCH 1133/1290] i2c: s3c24xx: fix transferring more than one message in polling mode To properly handle ACK on the bus when transferring more than one message in polling mode, move the polling handling loop from s3c24xx_i2c_message_start() to s3c24xx_i2c_doxfer(). This way i2c_s3c_irq_nextbyte() is always executed till the end, properly acknowledging the IRQ bits and no recursive calls to i2c_s3c_irq_nextbyte() are made. While touching this, also fix finishing transfers in polling mode by using common code path and always waiting for the bus to become idle and disabled. Fixes: 117053f77a5a ("i2c: s3c2410: Add polling mode support") Signed-off-by: Marek Szyprowski Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-s3c2410.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c index bf9a5670ef337..c0fe96a4f2c4f 100644 --- a/drivers/i2c/busses/i2c-s3c2410.c +++ b/drivers/i2c/busses/i2c-s3c2410.c @@ -279,16 +279,6 @@ static void s3c24xx_i2c_message_start(struct s3c24xx_i2c *i2c, stat |= S3C2410_IICSTAT_START; writel(stat, i2c->regs + S3C2410_IICSTAT); - - if (i2c->quirks & QUIRK_POLL) { - while ((i2c->msg_num != 0) && is_ack(i2c)) { - i2c_s3c_irq_nextbyte(i2c, stat); - stat = readl(i2c->regs + S3C2410_IICSTAT); - - if (stat & S3C2410_IICSTAT_ARBITR) - dev_err(i2c->dev, "deal with arbitration loss\n"); - } - } } static inline void s3c24xx_i2c_stop(struct s3c24xx_i2c *i2c, int ret) @@ -694,7 +684,7 @@ static void s3c24xx_i2c_wait_idle(struct s3c24xx_i2c *i2c) static int s3c24xx_i2c_doxfer(struct s3c24xx_i2c *i2c, struct i2c_msg *msgs, int num) { - unsigned long timeout; + unsigned long timeout = 0; int ret; ret = s3c24xx_i2c_set_master(i2c); @@ -714,16 +704,19 @@ static int s3c24xx_i2c_doxfer(struct s3c24xx_i2c *i2c, s3c24xx_i2c_message_start(i2c, msgs); if (i2c->quirks & QUIRK_POLL) { - ret = i2c->msg_idx; + while ((i2c->msg_num != 0) && is_ack(i2c)) { + unsigned long stat = readl(i2c->regs + S3C2410_IICSTAT); - if (ret != num) - dev_dbg(i2c->dev, "incomplete xfer (%d)\n", ret); + i2c_s3c_irq_nextbyte(i2c, stat); - goto out; + stat = readl(i2c->regs + S3C2410_IICSTAT); + if (stat & S3C2410_IICSTAT_ARBITR) + dev_err(i2c->dev, "deal with arbitration loss\n"); + } + } else { + timeout = wait_event_timeout(i2c->wait, i2c->msg_num == 0, HZ * 5); } - timeout = wait_event_timeout(i2c->wait, i2c->msg_num == 0, HZ * 5); - ret = i2c->msg_idx; /* -- GitLab From 187432b82173c86e0450d0e3d516b415ab2455f8 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 8 Nov 2023 17:43:54 +0100 Subject: [PATCH 1134/1290] i2c: s3c24xx: add support for atomic transfers Add support for atomic transfers using polling mode with interrupts intentionally disabled to get rid of the following warning introduced by commit 63b96983a5dd ("i2c: core: introduce callbacks for atomic transfers") during system reboot and power-off: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1518 at drivers/i2c/i2c-core.h:40 i2c_transfer+0xe8/0xf4 No atomic I2C transfer handler for 'i2c-0' ... ---[ end trace 0000000000000000 ]--- Signed-off-by: Marek Szyprowski Reviewed-by: Chanho Park Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-s3c2410.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c index c0fe96a4f2c4f..275f7c42165cd 100644 --- a/drivers/i2c/busses/i2c-s3c2410.c +++ b/drivers/i2c/busses/i2c-s3c2410.c @@ -76,6 +76,7 @@ #define QUIRK_HDMIPHY (1 << 1) #define QUIRK_NO_GPIO (1 << 2) #define QUIRK_POLL (1 << 3) +#define QUIRK_ATOMIC (1 << 4) /* Max time to wait for bus to become idle after a xfer (in us) */ #define S3C2410_IDLE_TIMEOUT 5000 @@ -174,7 +175,7 @@ static inline void s3c24xx_i2c_master_complete(struct s3c24xx_i2c *i2c, int ret) if (ret) i2c->msg_idx = ret; - if (!(i2c->quirks & QUIRK_POLL)) + if (!(i2c->quirks & (QUIRK_POLL | QUIRK_ATOMIC))) wake_up(&i2c->wait); } @@ -703,7 +704,7 @@ static int s3c24xx_i2c_doxfer(struct s3c24xx_i2c *i2c, s3c24xx_i2c_enable_irq(i2c); s3c24xx_i2c_message_start(i2c, msgs); - if (i2c->quirks & QUIRK_POLL) { + if (i2c->quirks & (QUIRK_POLL | QUIRK_ATOMIC)) { while ((i2c->msg_num != 0) && is_ack(i2c)) { unsigned long stat = readl(i2c->regs + S3C2410_IICSTAT); @@ -775,6 +776,21 @@ static int s3c24xx_i2c_xfer(struct i2c_adapter *adap, return -EREMOTEIO; } +static int s3c24xx_i2c_xfer_atomic(struct i2c_adapter *adap, + struct i2c_msg *msgs, int num) +{ + struct s3c24xx_i2c *i2c = (struct s3c24xx_i2c *)adap->algo_data; + int ret; + + disable_irq(i2c->irq); + i2c->quirks |= QUIRK_ATOMIC; + ret = s3c24xx_i2c_xfer(adap, msgs, num); + i2c->quirks &= ~QUIRK_ATOMIC; + enable_irq(i2c->irq); + + return ret; +} + /* declare our i2c functionality */ static u32 s3c24xx_i2c_func(struct i2c_adapter *adap) { @@ -785,6 +801,7 @@ static u32 s3c24xx_i2c_func(struct i2c_adapter *adap) /* i2c bus registration info */ static const struct i2c_algorithm s3c24xx_i2c_algorithm = { .master_xfer = s3c24xx_i2c_xfer, + .master_xfer_atomic = s3c24xx_i2c_xfer_atomic, .functionality = s3c24xx_i2c_func, }; -- GitLab From 92a85b7c6262f19c65a1c115cf15f411ba65a57c Mon Sep 17 00:00:00 2001 From: Tim Lunn Date: Sun, 3 Dec 2023 23:39:59 +1100 Subject: [PATCH 1135/1290] i2c: rk3x: Adjust mask/value offset for i2c2 on rv1126 Rockchip RV1126 is using old style i2c controller, the i2c2 bus uses a non-sequential offset in the grf register for the mask/value bits for this bus. This patch fixes i2c2 bus on rv1126 SoCs. Signed-off-by: Tim Lunn Acked-by: Heiko Stuebner Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-rk3x.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-rk3x.c b/drivers/i2c/busses/i2c-rk3x.c index a044ca0c35a19..06fec2344575d 100644 --- a/drivers/i2c/busses/i2c-rk3x.c +++ b/drivers/i2c/busses/i2c-rk3x.c @@ -1288,8 +1288,12 @@ static int rk3x_i2c_probe(struct platform_device *pdev) return -EINVAL; } - /* 27+i: write mask, 11+i: value */ - value = BIT(27 + bus_nr) | BIT(11 + bus_nr); + /* rv1126 i2c2 uses non-sequential write mask 20, value 4 */ + if (i2c->soc_data == &rv1126_soc_data && bus_nr == 2) + value = BIT(20) | BIT(4); + else + /* 27+i: write mask, 11+i: value */ + value = BIT(27 + bus_nr) | BIT(11 + bus_nr); ret = regmap_write(grf, i2c->soc_data->grf_offset, value); if (ret != 0) { -- GitLab From 13e3a512a29001cab68fe9a0c12be94e6d42a10c Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 14 Nov 2023 15:13:28 +0100 Subject: [PATCH 1136/1290] i2c: smbus: Support up to 8 SPD EEPROMs I originally restricted i2c_register_spd() to only support systems with up to 4 memory slots, so that we can experiment with it on a limited numbers of systems. It's been more than 3 years and it seems to work just fine, so the time has come to lift this arbitrary limitation. The maximum number of memory slots which can be connected to a single I2C segment is 8, so support that many SPD EEPROMs. Any system with more than 8 memory slots would have either multiple SMBus channels or SMBus multiplexing, so it would need dedicated care. We'll get to that later as needed. Signed-off-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-smbus.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/i2c-smbus.c b/drivers/i2c/i2c-smbus.c index 138c3f5e0093a..74807c6db596d 100644 --- a/drivers/i2c/i2c-smbus.c +++ b/drivers/i2c/i2c-smbus.c @@ -308,8 +308,8 @@ EXPORT_SYMBOL_GPL(i2c_free_slave_host_notify_device); * target systems are the same. * Restrictions to automatic SPD instantiation: * - Only works if all filled slots have the same memory type - * - Only works for DDR2, DDR3 and DDR4 for now - * - Only works on systems with 1 to 4 memory slots + * - Only works for DDR, DDR2, DDR3 and DDR4 for now + * - Only works on systems with 1 to 8 memory slots */ #if IS_ENABLED(CONFIG_DMI) void i2c_register_spd(struct i2c_adapter *adap) @@ -354,9 +354,9 @@ void i2c_register_spd(struct i2c_adapter *adap) dev_info(&adap->dev, "%d/%d memory slots populated (from DMI)\n", dimm_count, slot_count); - if (slot_count > 4) { + if (slot_count > 8) { dev_warn(&adap->dev, - "Systems with more than 4 memory slots not supported yet, not instantiating SPD\n"); + "Systems with more than 8 memory slots not supported yet, not instantiating SPD\n"); return; } -- GitLab From db63eacdf61d6bbcde783f15aaafbd8e66476e9a Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Thu, 30 Nov 2023 10:57:51 +0100 Subject: [PATCH 1137/1290] i2c: imx: Make SDA actually optional for bus recovering Both i2c_generic_scl_recovery() and the debug output indicate that SDA is purely optional for bus recovery. But devm_gpiod_get() never returns NULL making it mandatory. Fix this by calling devm_gpiod_get_optional instead. Signed-off-by: Alexander Stein Reviewed-by: Andi Shyti Reviewed-by: Oleksij Rempel Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-imx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index 1775a79aeba2a..88a053987403c 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -1401,7 +1401,7 @@ static int i2c_imx_init_recovery_info(struct imx_i2c_struct *i2c_imx, PINCTRL_STATE_DEFAULT); i2c_imx->pinctrl_pins_gpio = pinctrl_lookup_state(i2c_imx->pinctrl, "gpio"); - rinfo->sda_gpiod = devm_gpiod_get(&pdev->dev, "sda", GPIOD_IN); + rinfo->sda_gpiod = devm_gpiod_get_optional(&pdev->dev, "sda", GPIOD_IN); rinfo->scl_gpiod = devm_gpiod_get(&pdev->dev, "scl", GPIOD_OUT_HIGH_OPEN_DRAIN); if (PTR_ERR(rinfo->sda_gpiod) == -EPROBE_DEFER || -- GitLab From e535af5c4225fe2715d12322b645853ab9724648 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 6 Dec 2023 23:24:03 +0100 Subject: [PATCH 1138/1290] i2c: cpm: Remove linux,i2c-index conversion from be32 sparse reports an error on some data that gets converted from be32. That's because that data is typed u32 instead of __be32. The type is correct, the be32_to_cpu() conversion is not. Remove the conversion. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312042210.QL4DA8Av-lkp@intel.com/ Signed-off-by: Christophe Leroy Acked-By: Jochen Friedrich Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-cpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-cpm.c b/drivers/i2c/busses/i2c-cpm.c index 9a664abf734d6..4404b4aac6765 100644 --- a/drivers/i2c/busses/i2c-cpm.c +++ b/drivers/i2c/busses/i2c-cpm.c @@ -658,7 +658,7 @@ static int cpm_i2c_probe(struct platform_device *ofdev) /* register new adapter to i2c module... */ data = of_get_property(ofdev->dev.of_node, "linux,i2c-index", &len); - cpm->adap.nr = (data && len == 4) ? be32_to_cpup(data) : -1; + cpm->adap.nr = (data && len == 4) ? *data : -1; result = i2c_add_numbered_adapter(&cpm->adap); if (result < 0) -- GitLab From 2b0eee4f6add17a74f696a6f40ad2a4fa173613e Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 23 Nov 2023 11:30:32 +0100 Subject: [PATCH 1139/1290] eeprom: at24: use of_match_ptr() This driver does not depend on CONFIG_OF so using of_match_ptr() makes sense to reduce the size a bit. Signed-off-by: Bartosz Golaszewski --- drivers/misc/eeprom/at24.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c index f61a80597a22d..8279adade07e8 100644 --- a/drivers/misc/eeprom/at24.c +++ b/drivers/misc/eeprom/at24.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -242,7 +243,7 @@ static const struct i2c_device_id at24_ids[] = { }; MODULE_DEVICE_TABLE(i2c, at24_ids); -static const struct of_device_id at24_of_match[] = { +static const struct of_device_id __maybe_unused at24_of_match[] = { { .compatible = "atmel,24c00", .data = &at24_data_24c00 }, { .compatible = "atmel,24c01", .data = &at24_data_24c01 }, { .compatible = "atmel,24cs01", .data = &at24_data_24cs01 }, @@ -812,7 +813,7 @@ static struct i2c_driver at24_driver = { .driver = { .name = "at24", .pm = &at24_pm_ops, - .of_match_table = at24_of_match, + .of_match_table = of_match_ptr(at24_of_match), .acpi_match_table = ACPI_PTR(at24_acpi_ids), }, .probe = at24_probe, -- GitLab From 614ef4d30fe724ff8558a74a1926bd3051d39b67 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Mon, 27 Nov 2023 22:11:02 +0100 Subject: [PATCH 1140/1290] dt-bindings: at24: add ROHM BR24G04 Add compatible for ROHM Semiconductor BR24G04 EEPROMs. Signed-off-by: Philipp Zabel Signed-off-by: Roland Hieber Acked-by: Krzysztof Kozlowski Signed-off-by: Bartosz Golaszewski --- Documentation/devicetree/bindings/eeprom/at24.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/eeprom/at24.yaml b/Documentation/devicetree/bindings/eeprom/at24.yaml index b6864d0ee81e4..1812ef31d5f1e 100644 --- a/Documentation/devicetree/bindings/eeprom/at24.yaml +++ b/Documentation/devicetree/bindings/eeprom/at24.yaml @@ -123,6 +123,7 @@ properties: - enum: - onnn,cat24c04 - onnn,cat24c05 + - rohm,br24g04 - const: atmel,24c04 - items: - const: renesas,r1ex24016 -- GitLab From 94959c0e796e41128483588d133b9a7003b409f9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 16:22:43 +0100 Subject: [PATCH 1141/1290] i2c: make i2c_bus_type const Now that the driver core can properly handle constant struct bus_type, move the i2c_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Note, the sound/soc/rockchip/rk3399_gru_sound.c also needed tweaking as it decided to save off a pointer to a bus type for internal stuff, and it was using the i2c_bus_type as well. Signed-off-by: Greg Kroah-Hartman Acked-by: Mark Brown Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 2 +- include/linux/i2c.h | 2 +- sound/soc/rockchip/rk3399_gru_sound.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index f6c828bbd166a..3bd48d4b6318f 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -692,7 +692,7 @@ static struct attribute *i2c_dev_attrs[] = { }; ATTRIBUTE_GROUPS(i2c_dev); -struct bus_type i2c_bus_type = { +const struct bus_type i2c_bus_type = { .name = "i2c", .match = i2c_device_match, .probe = i2c_device_probe, diff --git a/include/linux/i2c.h b/include/linux/i2c.h index e01fb1097868c..652ecb7abedae 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -23,7 +23,7 @@ #include /* for swab16 */ #include -extern struct bus_type i2c_bus_type; +extern const struct bus_type i2c_bus_type; extern struct device_type i2c_adapter_type; extern struct device_type i2c_client_type; diff --git a/sound/soc/rockchip/rk3399_gru_sound.c b/sound/soc/rockchip/rk3399_gru_sound.c index 1a504ebd3a0e9..6c89c7331229f 100644 --- a/sound/soc/rockchip/rk3399_gru_sound.c +++ b/sound/soc/rockchip/rk3399_gru_sound.c @@ -446,7 +446,7 @@ static const struct rockchip_sound_route rockchip_routes[] = { struct dailink_match_data { const char *compatible; - struct bus_type *bus_type; + const struct bus_type *bus_type; }; static const struct dailink_match_data dailink_match[] = { -- GitLab From a8355235dbd571b32c750ee756dd6dac216d18f2 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 8 Nov 2023 07:38:07 +0100 Subject: [PATCH 1142/1290] i2c: mux: reg: Remove class-based device auto-detection support Legacy class-based device auto-detection shouldn't be used in new code. Therefore remove support in i2c-mux-reg as long as we don't have a user of this feature yet. Link: https://lore.kernel.org/linux-i2c/a22978a4-88e4-46f4-b71c-032b22321599@gmail.com/ Signed-off-by: Heiner Kallweit Signed-off-by: Wolfram Sang --- drivers/i2c/muxes/i2c-mux-reg.c | 4 +--- include/linux/platform_data/i2c-mux-reg.h | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-reg.c b/drivers/i2c/muxes/i2c-mux-reg.c index 9efc1ed01577b..8489971babd37 100644 --- a/drivers/i2c/muxes/i2c-mux-reg.c +++ b/drivers/i2c/muxes/i2c-mux-reg.c @@ -159,7 +159,6 @@ static int i2c_mux_reg_probe(struct platform_device *pdev) struct regmux *mux; struct i2c_adapter *parent; struct resource *res; - unsigned int class; int i, ret, nr; mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL); @@ -213,9 +212,8 @@ static int i2c_mux_reg_probe(struct platform_device *pdev) for (i = 0; i < mux->data.n_values; i++) { nr = mux->data.base_nr ? (mux->data.base_nr + i) : 0; - class = mux->data.classes ? mux->data.classes[i] : 0; - ret = i2c_mux_add_adapter(muxc, nr, mux->data.values[i], class); + ret = i2c_mux_add_adapter(muxc, nr, mux->data.values[i], 0); if (ret) goto err_del_mux_adapters; } diff --git a/include/linux/platform_data/i2c-mux-reg.h b/include/linux/platform_data/i2c-mux-reg.h index 2543c2a1c9aef..e2e8957683116 100644 --- a/include/linux/platform_data/i2c-mux-reg.h +++ b/include/linux/platform_data/i2c-mux-reg.h @@ -17,7 +17,6 @@ * @n_values: Number of multiplexer channels * @little_endian: Indicating if the register is in little endian * @write_only: Reading the register is not allowed by hardware - * @classes: Optional I2C auto-detection classes * @idle: Value to write to mux when idle * @idle_in_use: indicate if idle value is in use * @reg: Virtual address of the register to switch channel @@ -30,7 +29,6 @@ struct i2c_mux_reg_platform_data { int n_values; bool little_endian; bool write_only; - const unsigned int *classes; u32 idle; bool idle_in_use; void __iomem *reg; -- GitLab From 462e9804d2c90bfffb494718b5aae5c374ff3b78 Mon Sep 17 00:00:00 2001 From: Hans Hu Date: Thu, 2 Nov 2023 10:53:51 +0800 Subject: [PATCH 1143/1290] i2c: wmt: Reduce redundant: bus busy check Put wmt_i2c_wait_bus_not_busy() in a more appropriate place to reduce code redundancy Signed-off-by: Hans Hu Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-wmt.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/i2c/busses/i2c-wmt.c b/drivers/i2c/busses/i2c-wmt.c index 76118abc6e104..d554c63775331 100644 --- a/drivers/i2c/busses/i2c-wmt.c +++ b/drivers/i2c/busses/i2c-wmt.c @@ -128,12 +128,6 @@ static int wmt_i2c_write(struct i2c_adapter *adap, struct i2c_msg *pmsg, unsigned long wait_result; int xfer_len = 0; - if (!(pmsg->flags & I2C_M_NOSTART)) { - ret = wmt_i2c_wait_bus_not_busy(i2c_dev); - if (ret < 0) - return ret; - } - if (pmsg->len == 0) { /* * We still need to run through the while (..) once, so @@ -219,12 +213,6 @@ static int wmt_i2c_read(struct i2c_adapter *adap, struct i2c_msg *pmsg, unsigned long wait_result; u32 xfer_len = 0; - if (!(pmsg->flags & I2C_M_NOSTART)) { - ret = wmt_i2c_wait_bus_not_busy(i2c_dev); - if (ret < 0) - return ret; - } - val = readw(i2c_dev->base + REG_CR); val &= ~CR_TX_END; writew(val, i2c_dev->base + REG_CR); @@ -297,11 +285,18 @@ static int wmt_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *pmsg; int i, is_last; int ret = 0; + struct wmt_i2c_dev *i2c_dev = i2c_get_adapdata(adap); for (i = 0; ret >= 0 && i < num; i++) { is_last = ((i + 1) == num); pmsg = &msgs[i]; + if (!(pmsg->flags & I2C_M_NOSTART)) { + ret = wmt_i2c_wait_bus_not_busy(i2c_dev); + if (ret < 0) + return ret; + } + if (pmsg->flags & I2C_M_RD) ret = wmt_i2c_read(adap, pmsg, is_last); else -- GitLab From 8a22991a48f2602aaab79df1301483d50bc51b2c Mon Sep 17 00:00:00 2001 From: Hans Hu Date: Thu, 2 Nov 2023 10:53:52 +0800 Subject: [PATCH 1144/1290] i2c: wmt: Reduce redundant: wait event complete Put the handling of interrupt events in a function class to reduce code redundancy. Signed-off-by: Hans Hu Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-wmt.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/i2c/busses/i2c-wmt.c b/drivers/i2c/busses/i2c-wmt.c index d554c63775331..1b31330ba4ebc 100644 --- a/drivers/i2c/busses/i2c-wmt.c +++ b/drivers/i2c/busses/i2c-wmt.c @@ -109,6 +109,12 @@ static int wmt_i2c_wait_bus_not_busy(struct wmt_i2c_dev *i2c_dev) static int wmt_check_status(struct wmt_i2c_dev *i2c_dev) { int ret = 0; + unsigned long wait_result; + + wait_result = wait_for_completion_timeout(&i2c_dev->complete, + msecs_to_jiffies(500)); + if (!wait_result) + return -ETIMEDOUT; if (i2c_dev->cmd_status & ISR_NACK_ADDR) ret = -EIO; @@ -125,7 +131,6 @@ static int wmt_i2c_write(struct i2c_adapter *adap, struct i2c_msg *pmsg, struct wmt_i2c_dev *i2c_dev = i2c_get_adapdata(adap); u16 val, tcr_val; int ret; - unsigned long wait_result; int xfer_len = 0; if (pmsg->len == 0) { @@ -167,12 +172,6 @@ static int wmt_i2c_write(struct i2c_adapter *adap, struct i2c_msg *pmsg, } while (xfer_len < pmsg->len) { - wait_result = wait_for_completion_timeout(&i2c_dev->complete, - msecs_to_jiffies(500)); - - if (wait_result == 0) - return -ETIMEDOUT; - ret = wmt_check_status(i2c_dev); if (ret) return ret; @@ -210,7 +209,6 @@ static int wmt_i2c_read(struct i2c_adapter *adap, struct i2c_msg *pmsg, struct wmt_i2c_dev *i2c_dev = i2c_get_adapdata(adap); u16 val, tcr_val; int ret; - unsigned long wait_result; u32 xfer_len = 0; val = readw(i2c_dev->base + REG_CR); @@ -251,12 +249,6 @@ static int wmt_i2c_read(struct i2c_adapter *adap, struct i2c_msg *pmsg, } while (xfer_len < pmsg->len) { - wait_result = wait_for_completion_timeout(&i2c_dev->complete, - msecs_to_jiffies(500)); - - if (!wait_result) - return -ETIMEDOUT; - ret = wmt_check_status(i2c_dev); if (ret) return ret; -- GitLab From 7108ecf3cbc728ec4a72ce29136cbcfedf4a9242 Mon Sep 17 00:00:00 2001 From: Hans Hu Date: Thu, 2 Nov 2023 10:53:53 +0800 Subject: [PATCH 1145/1290] i2c: wmt: Reduce redundant: clock mode setting The frequency setting mode is adjusted to reduce the code redundancy, and it is also convenient to share with zhaoxin Signed-off-by: Hans Hu Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-wmt.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/drivers/i2c/busses/i2c-wmt.c b/drivers/i2c/busses/i2c-wmt.c index 1b31330ba4ebc..a44ce5fdde8a3 100644 --- a/drivers/i2c/busses/i2c-wmt.c +++ b/drivers/i2c/busses/i2c-wmt.c @@ -74,9 +74,6 @@ #define MCR_APB_96M 7 #define MCR_APB_166M 12 -#define I2C_MODE_STANDARD 0 -#define I2C_MODE_FAST 1 - #define WMT_I2C_TIMEOUT (msecs_to_jiffies(1000)) struct wmt_i2c_dev { @@ -85,7 +82,7 @@ struct wmt_i2c_dev { struct device *dev; void __iomem *base; struct clk *clk; - int mode; + u16 tcr; int irq; u16 cmd_status; }; @@ -129,7 +126,7 @@ static int wmt_i2c_write(struct i2c_adapter *adap, struct i2c_msg *pmsg, int last) { struct wmt_i2c_dev *i2c_dev = i2c_get_adapdata(adap); - u16 val, tcr_val; + u16 val, tcr_val = i2c_dev->tcr; int ret; int xfer_len = 0; @@ -156,11 +153,6 @@ static int wmt_i2c_write(struct i2c_adapter *adap, struct i2c_msg *pmsg, reinit_completion(&i2c_dev->complete); - if (i2c_dev->mode == I2C_MODE_STANDARD) - tcr_val = TCR_STANDARD_MODE; - else - tcr_val = TCR_FAST_MODE; - tcr_val |= (TCR_MASTER_WRITE | (pmsg->addr & TCR_SLAVE_ADDR_MASK)); writew(tcr_val, i2c_dev->base + REG_TCR); @@ -207,7 +199,7 @@ static int wmt_i2c_read(struct i2c_adapter *adap, struct i2c_msg *pmsg, int last) { struct wmt_i2c_dev *i2c_dev = i2c_get_adapdata(adap); - u16 val, tcr_val; + u16 val, tcr_val = i2c_dev->tcr; int ret; u32 xfer_len = 0; @@ -233,11 +225,6 @@ static int wmt_i2c_read(struct i2c_adapter *adap, struct i2c_msg *pmsg, reinit_completion(&i2c_dev->complete); - if (i2c_dev->mode == I2C_MODE_STANDARD) - tcr_val = TCR_STANDARD_MODE; - else - tcr_val = TCR_FAST_MODE; - tcr_val |= TCR_MASTER_READ | (pmsg->addr & TCR_SLAVE_ADDR_MASK); writew(tcr_val, i2c_dev->base + REG_TCR); @@ -346,10 +333,10 @@ static int wmt_i2c_reset_hardware(struct wmt_i2c_dev *i2c_dev) readw(i2c_dev->base + REG_CSR); /* read clear */ writew(ISR_WRITE_ALL, i2c_dev->base + REG_ISR); - if (i2c_dev->mode == I2C_MODE_STANDARD) - writew(SCL_TIMEOUT(128) | TR_STD, i2c_dev->base + REG_TR); - else + if (i2c_dev->tcr == TCR_FAST_MODE) writew(SCL_TIMEOUT(128) | TR_HS, i2c_dev->base + REG_TR); + else + writew(SCL_TIMEOUT(128) | TR_STD, i2c_dev->base + REG_TR); return 0; } @@ -382,10 +369,9 @@ static int wmt_i2c_probe(struct platform_device *pdev) return PTR_ERR(i2c_dev->clk); } - i2c_dev->mode = I2C_MODE_STANDARD; err = of_property_read_u32(np, "clock-frequency", &clk_rate); if (!err && (clk_rate == I2C_MAX_FAST_MODE_FREQ)) - i2c_dev->mode = I2C_MODE_FAST; + i2c_dev->tcr = TCR_FAST_MODE; i2c_dev->dev = &pdev->dev; -- GitLab From 4c541c6a66df396c35693a21c5bc40553cd4d2fa Mon Sep 17 00:00:00 2001 From: Hans Hu Date: Thu, 2 Nov 2023 10:53:55 +0800 Subject: [PATCH 1146/1290] i2c: wmt: Reduce redundant: function parameter Use more appropriate parameter passing to reduce the amount of code Signed-off-by: Hans Hu Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-wmt.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/i2c/busses/i2c-wmt.c b/drivers/i2c/busses/i2c-wmt.c index a44ce5fdde8a3..7de67886e8ca3 100644 --- a/drivers/i2c/busses/i2c-wmt.c +++ b/drivers/i2c/busses/i2c-wmt.c @@ -122,10 +122,9 @@ static int wmt_check_status(struct wmt_i2c_dev *i2c_dev) return ret; } -static int wmt_i2c_write(struct i2c_adapter *adap, struct i2c_msg *pmsg, +static int wmt_i2c_write(struct wmt_i2c_dev *i2c_dev, struct i2c_msg *pmsg, int last) { - struct wmt_i2c_dev *i2c_dev = i2c_get_adapdata(adap); u16 val, tcr_val = i2c_dev->tcr; int ret; int xfer_len = 0; @@ -195,10 +194,8 @@ static int wmt_i2c_write(struct i2c_adapter *adap, struct i2c_msg *pmsg, return 0; } -static int wmt_i2c_read(struct i2c_adapter *adap, struct i2c_msg *pmsg, - int last) +static int wmt_i2c_read(struct wmt_i2c_dev *i2c_dev, struct i2c_msg *pmsg) { - struct wmt_i2c_dev *i2c_dev = i2c_get_adapdata(adap); u16 val, tcr_val = i2c_dev->tcr; int ret; u32 xfer_len = 0; @@ -262,13 +259,11 @@ static int wmt_i2c_xfer(struct i2c_adapter *adap, int num) { struct i2c_msg *pmsg; - int i, is_last; + int i; int ret = 0; struct wmt_i2c_dev *i2c_dev = i2c_get_adapdata(adap); for (i = 0; ret >= 0 && i < num; i++) { - is_last = ((i + 1) == num); - pmsg = &msgs[i]; if (!(pmsg->flags & I2C_M_NOSTART)) { ret = wmt_i2c_wait_bus_not_busy(i2c_dev); @@ -277,9 +272,9 @@ static int wmt_i2c_xfer(struct i2c_adapter *adap, } if (pmsg->flags & I2C_M_RD) - ret = wmt_i2c_read(adap, pmsg, is_last); + ret = wmt_i2c_read(i2c_dev, pmsg); else - ret = wmt_i2c_write(adap, pmsg, is_last); + ret = wmt_i2c_write(i2c_dev, pmsg, (i + 1) == num); } return (ret < 0) ? ret : i; -- GitLab From bb7c0209c4fead37fbc9fd07f9617f40ba21189e Mon Sep 17 00:00:00 2001 From: Hans Hu Date: Thu, 2 Nov 2023 10:53:54 +0800 Subject: [PATCH 1147/1290] i2c: wmt: Reduce redundant: REG_CR setting These Settings for the same register, REG_CR, can be put together to reduce code redundancy. Signed-off-by: Hans Hu Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-wmt.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/drivers/i2c/busses/i2c-wmt.c b/drivers/i2c/busses/i2c-wmt.c index 7de67886e8ca3..ec2a8da134e56 100644 --- a/drivers/i2c/busses/i2c-wmt.c +++ b/drivers/i2c/busses/i2c-wmt.c @@ -143,9 +143,6 @@ static int wmt_i2c_write(struct wmt_i2c_dev *i2c_dev, struct i2c_msg *pmsg, if (!(pmsg->flags & I2C_M_NOSTART)) { val = readw(i2c_dev->base + REG_CR); val &= ~CR_TX_END; - writew(val, i2c_dev->base + REG_CR); - - val = readw(i2c_dev->base + REG_CR); val |= CR_CPU_RDY; writew(val, i2c_dev->base + REG_CR); } @@ -201,24 +198,15 @@ static int wmt_i2c_read(struct wmt_i2c_dev *i2c_dev, struct i2c_msg *pmsg) u32 xfer_len = 0; val = readw(i2c_dev->base + REG_CR); - val &= ~CR_TX_END; - writew(val, i2c_dev->base + REG_CR); + val &= ~(CR_TX_END | CR_TX_NEXT_NO_ACK); - val = readw(i2c_dev->base + REG_CR); - val &= ~CR_TX_NEXT_NO_ACK; - writew(val, i2c_dev->base + REG_CR); - - if (!(pmsg->flags & I2C_M_NOSTART)) { - val = readw(i2c_dev->base + REG_CR); + if (!(pmsg->flags & I2C_M_NOSTART)) val |= CR_CPU_RDY; - writew(val, i2c_dev->base + REG_CR); - } - if (pmsg->len == 1) { - val = readw(i2c_dev->base + REG_CR); + if (pmsg->len == 1) val |= CR_TX_NEXT_NO_ACK; - writew(val, i2c_dev->base + REG_CR); - } + + writew(val, i2c_dev->base + REG_CR); reinit_completion(&i2c_dev->complete); @@ -240,15 +228,10 @@ static int wmt_i2c_read(struct wmt_i2c_dev *i2c_dev, struct i2c_msg *pmsg) pmsg->buf[xfer_len] = readw(i2c_dev->base + REG_CDR) >> 8; xfer_len++; - if (xfer_len == pmsg->len - 1) { - val = readw(i2c_dev->base + REG_CR); - val |= (CR_TX_NEXT_NO_ACK | CR_CPU_RDY); - writew(val, i2c_dev->base + REG_CR); - } else { - val = readw(i2c_dev->base + REG_CR); - val |= CR_CPU_RDY; - writew(val, i2c_dev->base + REG_CR); - } + val = readw(i2c_dev->base + REG_CR) | CR_CPU_RDY; + if (xfer_len == pmsg->len - 1) + val |= CR_TX_NEXT_NO_ACK; + writew(val, i2c_dev->base + REG_CR); } return 0; -- GitLab From 6d9450464ce1825d7d0e7ef5d33a9d2701f41c76 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Wed, 20 Dec 2023 17:10:01 +0100 Subject: [PATCH 1148/1290] i2c: i801: Add lis3lv02d for Dell Precision 3540 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On the Dell Precision 3540/0M14W7, BIOS 1.7.4 05/12/2020, Linux prints the warning below. i801_smbus 0000:00:1f.4: Accelerometer lis3lv02d is present on SMBus but its address is unknown, skipping registration With the help of Wolfram Sang, the test to probe it on I2C bus 6 at address 0x29 was successful. $ echo lis3lv02d 0x29 | sudo tee /sys/bus/i2c/devices/i2c-6/new_device [ 2110.787000] i2c i2c-6: new_device: Instantiated device lis3lv02d at 0x29 [ 2110.791932] lis3lv02d_i2c 6-0029: supply Vdd not found, using dummy regulator [ 2110.791981] lis3lv02d_i2c 6-0029: supply Vdd_IO not found, using dummy regulator [ 2110.809233] lis3lv02d: 8 bits 3DC sensor found [ 2110.900668] input: ST LIS3LV02DL Accelerometer as /devices/platform/lis3lv02d/input/input23 So, the device has that accelerometer. Add the I2C address to the mapping list. Link: https://lore.kernel.org/linux-i2c/97708c11-ac85-fb62-2c8e-d37739ca826f@molgen.mpg.de/ Signed-off-by: Paul Menzel Acked-by: Pali Rohár Reviewed-by: Andi Shyti [wsa: shortened commit message a little] Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-i801.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 070999139c6dc..cb9660f841171 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -1230,6 +1230,7 @@ static const struct { * Additional individual entries were added after verification. */ { "Latitude 5480", 0x29 }, + { "Precision 3540", 0x29 }, { "Vostro V131", 0x1d }, { "Vostro 5568", 0x29 }, }; -- GitLab From 2f189493ae32be9768b27072c9388e62b38d2dda Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Wed, 20 Dec 2023 17:10:02 +0100 Subject: [PATCH 1149/1290] i2c: i801: Add lis3lv02d for Dell XPS 15 7590 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On the Dell XPS 15 7590/0VYV0G, BIOS 1.24.0 09/11/2023, Linux prints the warning below. i801_smbus 0000:00:1f.4: Accelerometer lis3lv02d is present on SMBus but its address is unknown, skipping registration Following the same suggestions by Wolfram Sang as for the Dell Precision 3540 [1], the accelerometer can be successfully found on I2C bus 2 at address 0x29. $ echo lis3lv02d 0x29 | sudo tee /sys/bus/i2c/devices/i2c-2/new_device lis3lv02d 0x29 $ dmesg | tail -5 [ 549.522876] lis3lv02d_i2c 2-0029: supply Vdd not found, using dummy regulator [ 549.522904] lis3lv02d_i2c 2-0029: supply Vdd_IO not found, using dummy regulator [ 549.542486] lis3lv02d: 8 bits 3DC sensor found [ 549.630022] input: ST LIS3LV02DL Accelerometer as /devices/platform/lis3lv02d/input/input35 [ 549.630586] i2c i2c-2: new_device: Instantiated device lis3lv02d at 0x29 So, the device has that accelerometer. Add the I2C address to the mapping list, and test it successfully on the device. [1]: https://lore.kernel.org/linux-i2c/97708c11-ac85-fb62-2c8e-d37739ca826f@molgen.mpg.de/ Signed-off-by: Paul Menzel Acked-by: Pali Rohár Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-i801.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index cb9660f841171..3932e8d96a171 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -1233,6 +1233,7 @@ static const struct { { "Precision 3540", 0x29 }, { "Vostro V131", 0x1d }, { "Vostro 5568", 0x29 }, + { "XPS 15 7590", 0x29 }, }; static void register_dell_lis3lv02d_i2c_device(struct i801_priv *priv) -- GitLab From a51e224c2f42417e95a3e1a672ade221bcd006ba Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Fri, 15 Dec 2023 18:06:05 +0100 Subject: [PATCH 1150/1290] i2c: stm32f7: use dev_err_probe upon calls of devm_request_irq Convert error handling upon calls of devm_request_irq functions during the probe of the driver. Signed-off-by: Alain Volmat Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-stm32f7.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index 983509936727e..d99f152ac7275 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -2199,19 +2199,13 @@ static int stm32f7_i2c_probe(struct platform_device *pdev) stm32f7_i2c_isr_event_thread, IRQF_ONESHOT, pdev->name, i2c_dev); - if (ret) { - dev_err(&pdev->dev, "Failed to request irq event %i\n", - irq_event); - return ret; - } + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to request irq event\n"); ret = devm_request_irq(&pdev->dev, irq_error, stm32f7_i2c_isr_error, 0, pdev->name, i2c_dev); - if (ret) { - dev_err(&pdev->dev, "Failed to request irq error %i\n", - irq_error); - return ret; - } + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to request irq error\n"); setup = of_device_get_match_data(&pdev->dev); if (!setup) { -- GitLab From e6103cd45ef0e14eb02f0666bc6b494902cfe821 Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Fri, 15 Dec 2023 18:06:06 +0100 Subject: [PATCH 1151/1290] i2c: stm32f7: perform most of irq job in threaded handler The irq handling is currently split between the irq handler and the threaded irq handler. Some of the handling (such as dma related stuffs) done within the irq handler might sleep or take some time leading to issues if the kernel is built with realtime constraints. In order to fix that, perform an overall rework to perform most of the job within the threaded handler and only keep fifo access in the non threaded handler. Signed-off-by: Alain Volmat Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-stm32f7.c | 126 ++++++++++++++----------------- 1 file changed, 56 insertions(+), 70 deletions(-) diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index d99f152ac7275..dde70e7ec7226 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -1497,17 +1497,11 @@ static irqreturn_t stm32f7_i2c_slave_isr_event(struct stm32f7_i2c_dev *i2c_dev) static irqreturn_t stm32f7_i2c_isr_event(int irq, void *data) { struct stm32f7_i2c_dev *i2c_dev = data; - struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg; - struct stm32_i2c_dma *dma = i2c_dev->dma; - void __iomem *base = i2c_dev->base; - u32 status, mask; - int ret = IRQ_HANDLED; + u32 status; - /* Check if the interrupt if for a slave device */ - if (!i2c_dev->master_mode) { - ret = stm32f7_i2c_slave_isr_event(i2c_dev); - return ret; - } + /* Check if the interrupt is for a slave device */ + if (!i2c_dev->master_mode) + return IRQ_WAKE_THREAD; status = readl_relaxed(i2c_dev->base + STM32F7_I2C_ISR); @@ -1519,6 +1513,29 @@ static irqreturn_t stm32f7_i2c_isr_event(int irq, void *data) if (status & STM32F7_I2C_ISR_RXNE) stm32f7_i2c_read_rx_data(i2c_dev); + /* Wake up the thread if other flags are raised */ + if (status & + (STM32F7_I2C_ISR_NACKF | STM32F7_I2C_ISR_STOPF | + STM32F7_I2C_ISR_TC | STM32F7_I2C_ISR_TCR)) + return IRQ_WAKE_THREAD; + + return IRQ_HANDLED; +} + +static irqreturn_t stm32f7_i2c_isr_event_thread(int irq, void *data) +{ + struct stm32f7_i2c_dev *i2c_dev = data; + struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg; + struct stm32_i2c_dma *dma = i2c_dev->dma; + void __iomem *base = i2c_dev->base; + u32 status, mask; + int ret; + + if (!i2c_dev->master_mode) + return stm32f7_i2c_slave_isr_event(i2c_dev); + + status = readl_relaxed(i2c_dev->base + STM32F7_I2C_ISR); + /* NACK received */ if (status & STM32F7_I2C_ISR_NACKF) { dev_dbg(i2c_dev->dev, "<%s>: Receive NACK (addr %x)\n", @@ -1531,33 +1548,28 @@ static irqreturn_t stm32f7_i2c_isr_event(int irq, void *data) f7_msg->result = -ENXIO; } - /* STOP detection flag */ - if (status & STM32F7_I2C_ISR_STOPF) { - /* Disable interrupts */ - if (stm32f7_i2c_is_slave_registered(i2c_dev)) - mask = STM32F7_I2C_XFER_IRQ_MASK; + if (status & STM32F7_I2C_ISR_TCR) { + if (f7_msg->smbus) + stm32f7_i2c_smbus_reload(i2c_dev); else - mask = STM32F7_I2C_ALL_IRQ_MASK; - stm32f7_i2c_disable_irq(i2c_dev, mask); - - /* Clear STOP flag */ - writel_relaxed(STM32F7_I2C_ICR_STOPCF, base + STM32F7_I2C_ICR); - - if (i2c_dev->use_dma && !f7_msg->result) { - ret = IRQ_WAKE_THREAD; - } else { - i2c_dev->master_mode = false; - complete(&i2c_dev->complete); - } + stm32f7_i2c_reload(i2c_dev); } /* Transfer complete */ if (status & STM32F7_I2C_ISR_TC) { + /* Wait for dma transfer completion before sending next message */ + if (i2c_dev->use_dma && !f7_msg->result) { + ret = wait_for_completion_timeout(&i2c_dev->dma->dma_complete, HZ); + if (!ret) { + dev_dbg(i2c_dev->dev, "<%s>: Timed out\n", __func__); + stm32f7_i2c_disable_dma_req(i2c_dev); + dmaengine_terminate_async(dma->chan_using); + f7_msg->result = -ETIMEDOUT; + } + } if (f7_msg->stop) { mask = STM32F7_I2C_CR2_STOP; stm32f7_i2c_set_bits(base + STM32F7_I2C_CR2, mask); - } else if (i2c_dev->use_dma && !f7_msg->result) { - ret = IRQ_WAKE_THREAD; } else if (f7_msg->smbus) { stm32f7_i2c_smbus_rep_start(i2c_dev); } else { @@ -1567,47 +1579,18 @@ static irqreturn_t stm32f7_i2c_isr_event(int irq, void *data) } } - if (status & STM32F7_I2C_ISR_TCR) { - if (f7_msg->smbus) - stm32f7_i2c_smbus_reload(i2c_dev); + /* STOP detection flag */ + if (status & STM32F7_I2C_ISR_STOPF) { + /* Disable interrupts */ + if (stm32f7_i2c_is_slave_registered(i2c_dev)) + mask = STM32F7_I2C_XFER_IRQ_MASK; else - stm32f7_i2c_reload(i2c_dev); - } - - return ret; -} - -static irqreturn_t stm32f7_i2c_isr_event_thread(int irq, void *data) -{ - struct stm32f7_i2c_dev *i2c_dev = data; - struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg; - struct stm32_i2c_dma *dma = i2c_dev->dma; - u32 status; - int ret; - - /* - * Wait for dma transfer completion before sending next message or - * notity the end of xfer to the client - */ - ret = wait_for_completion_timeout(&i2c_dev->dma->dma_complete, HZ); - if (!ret) { - dev_dbg(i2c_dev->dev, "<%s>: Timed out\n", __func__); - stm32f7_i2c_disable_dma_req(i2c_dev); - dmaengine_terminate_async(dma->chan_using); - f7_msg->result = -ETIMEDOUT; - } + mask = STM32F7_I2C_ALL_IRQ_MASK; + stm32f7_i2c_disable_irq(i2c_dev, mask); - status = readl_relaxed(i2c_dev->base + STM32F7_I2C_ISR); + /* Clear STOP flag */ + writel_relaxed(STM32F7_I2C_ICR_STOPCF, base + STM32F7_I2C_ICR); - if (status & STM32F7_I2C_ISR_TC) { - if (f7_msg->smbus) { - stm32f7_i2c_smbus_rep_start(i2c_dev); - } else { - i2c_dev->msg_id++; - i2c_dev->msg++; - stm32f7_i2c_xfer_msg(i2c_dev, i2c_dev->msg); - } - } else { i2c_dev->master_mode = false; complete(&i2c_dev->complete); } @@ -1615,7 +1598,7 @@ static irqreturn_t stm32f7_i2c_isr_event_thread(int irq, void *data) return IRQ_HANDLED; } -static irqreturn_t stm32f7_i2c_isr_error(int irq, void *data) +static irqreturn_t stm32f7_i2c_isr_error_thread(int irq, void *data) { struct stm32f7_i2c_dev *i2c_dev = data; struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg; @@ -2202,8 +2185,11 @@ static int stm32f7_i2c_probe(struct platform_device *pdev) if (ret) return dev_err_probe(&pdev->dev, ret, "Failed to request irq event\n"); - ret = devm_request_irq(&pdev->dev, irq_error, stm32f7_i2c_isr_error, 0, - pdev->name, i2c_dev); + ret = devm_request_threaded_irq(&pdev->dev, irq_error, + NULL, + stm32f7_i2c_isr_error_thread, + IRQF_ONESHOT, + pdev->name, i2c_dev); if (ret) return dev_err_probe(&pdev->dev, ret, "Failed to request irq error\n"); -- GitLab From 33a00d919253022aabafecae6bc88a6fad446589 Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Fri, 15 Dec 2023 18:06:07 +0100 Subject: [PATCH 1152/1290] i2c: stm32f7: simplify status messages in case of errors Avoid usage of __func__ when reporting an error message since dev_err/dev_dbg are already providing enough details to identify the source of the message. Signed-off-by: Alain Volmat Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-stm32f7.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index dde70e7ec7226..15100fed0a3f3 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -1602,6 +1602,7 @@ static irqreturn_t stm32f7_i2c_isr_error_thread(int irq, void *data) { struct stm32f7_i2c_dev *i2c_dev = data; struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg; + u16 addr = f7_msg->addr; void __iomem *base = i2c_dev->base; struct device *dev = i2c_dev->dev; struct stm32_i2c_dma *dma = i2c_dev->dma; @@ -1611,8 +1612,7 @@ static irqreturn_t stm32f7_i2c_isr_error_thread(int irq, void *data) /* Bus error */ if (status & STM32F7_I2C_ISR_BERR) { - dev_err(dev, "<%s>: Bus error accessing addr 0x%x\n", - __func__, f7_msg->addr); + dev_err(dev, "Bus error accessing addr 0x%x\n", addr); writel_relaxed(STM32F7_I2C_ICR_BERRCF, base + STM32F7_I2C_ICR); stm32f7_i2c_release_bus(&i2c_dev->adap); f7_msg->result = -EIO; @@ -1620,21 +1620,19 @@ static irqreturn_t stm32f7_i2c_isr_error_thread(int irq, void *data) /* Arbitration loss */ if (status & STM32F7_I2C_ISR_ARLO) { - dev_dbg(dev, "<%s>: Arbitration loss accessing addr 0x%x\n", - __func__, f7_msg->addr); + dev_dbg(dev, "Arbitration loss accessing addr 0x%x\n", addr); writel_relaxed(STM32F7_I2C_ICR_ARLOCF, base + STM32F7_I2C_ICR); f7_msg->result = -EAGAIN; } if (status & STM32F7_I2C_ISR_PECERR) { - dev_err(dev, "<%s>: PEC error in reception accessing addr 0x%x\n", - __func__, f7_msg->addr); + dev_err(dev, "PEC error in reception accessing addr 0x%x\n", addr); writel_relaxed(STM32F7_I2C_ICR_PECCF, base + STM32F7_I2C_ICR); f7_msg->result = -EINVAL; } if (status & STM32F7_I2C_ISR_ALERT) { - dev_dbg(dev, "<%s>: SMBus alert received\n", __func__); + dev_dbg(dev, "SMBus alert received\n"); writel_relaxed(STM32F7_I2C_ICR_ALERTCF, base + STM32F7_I2C_ICR); i2c_handle_smbus_alert(i2c_dev->alert->ara); return IRQ_HANDLED; -- GitLab From bf12998e1a68a82d596ea33b00b3ebc75ce52bb5 Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Fri, 15 Dec 2023 18:06:08 +0100 Subject: [PATCH 1153/1290] dt-bindings: i2c: document st,stm32mp25-i2c compatible Add a new compatible st,stm32mp25-i2c for the STM32MP25 series which has only one interrupt line for both events and errors and differs in term of handling of FastModePlus. Signed-off-by: Alain Volmat Reviewed-by: Conor Dooley Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/st,stm32-i2c.yaml | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml b/Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml index 94b75d9f66cdb..1b31b87c1800a 100644 --- a/Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml @@ -19,6 +19,7 @@ allOf: - st,stm32f7-i2c - st,stm32mp13-i2c - st,stm32mp15-i2c + - st,stm32mp25-i2c then: properties: i2c-scl-rising-time-ns: @@ -41,6 +42,30 @@ allOf: clock-frequency: enum: [100000, 400000] + - if: + properties: + compatible: + contains: + enum: + - st,stm32f4-i2c + - st,stm32f7-i2c + - st,stm32mp13-i2c + - st,stm32mp15-i2c + then: + properties: + interrupts: + minItems: 2 + + interrupt-names: + minItems: 2 + else: + properties: + interrupts: + maxItems: 1 + + interrupt-names: + maxItems: 1 + properties: compatible: enum: @@ -48,6 +73,7 @@ properties: - st,stm32f7-i2c - st,stm32mp13-i2c - st,stm32mp15-i2c + - st,stm32mp25-i2c reg: maxItems: 1 @@ -56,11 +82,13 @@ properties: items: - description: interrupt ID for I2C event - description: interrupt ID for I2C error + minItems: 1 interrupt-names: items: - const: event - const: error + minItems: 1 resets: maxItems: 1 -- GitLab From a058b24c08023ea0ea0a7c38e6845f1d25139a22 Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Fri, 15 Dec 2023 18:06:09 +0100 Subject: [PATCH 1154/1290] i2c: stm32f7: perform I2C_ISR read once at beginning of event isr Move readl_relaxed of I2C_ISR register at beginning of event isr so that it done once for both master & slave handling. Signed-off-by: Alain Volmat Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-stm32f7.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index 15100fed0a3f3..f7abf6d3a7b96 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -1419,15 +1419,13 @@ static bool stm32f7_i2c_is_slave_busy(struct stm32f7_i2c_dev *i2c_dev) return i == busy; } -static irqreturn_t stm32f7_i2c_slave_isr_event(struct stm32f7_i2c_dev *i2c_dev) +static irqreturn_t stm32f7_i2c_slave_isr_event(struct stm32f7_i2c_dev *i2c_dev, u32 status) { void __iomem *base = i2c_dev->base; - u32 cr2, status, mask; + u32 cr2, mask; u8 val; int ret; - status = readl_relaxed(i2c_dev->base + STM32F7_I2C_ISR); - /* Slave transmitter mode */ if (status & STM32F7_I2C_ISR_TXIS) { i2c_slave_event(i2c_dev->slave_running, @@ -1531,11 +1529,11 @@ static irqreturn_t stm32f7_i2c_isr_event_thread(int irq, void *data) u32 status, mask; int ret; - if (!i2c_dev->master_mode) - return stm32f7_i2c_slave_isr_event(i2c_dev); - status = readl_relaxed(i2c_dev->base + STM32F7_I2C_ISR); + if (!i2c_dev->master_mode) + return stm32f7_i2c_slave_isr_event(i2c_dev, status); + /* NACK received */ if (status & STM32F7_I2C_ISR_NACKF) { dev_dbg(i2c_dev->dev, "<%s>: Receive NACK (addr %x)\n", -- GitLab From 90f9b1406236853bd524ddde86cc5a945690c6b7 Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Fri, 15 Dec 2023 18:06:10 +0100 Subject: [PATCH 1155/1290] i2c: stm32f7: add support for stm32mp25 soc The stm32mp25 has only a single interrupt line used for both events and errors. In order to cope with that, reorganise the error handling code so that it can be called either from the common handler (used in case of SoC having only a single IT line) and the error handler for others. The CR1 register also embeds a new FMP bit, necessary when running at Fast Mode Plus frequency. This bit should be used instead of the SYSCFG bit used on other platforms. Add a new compatible to distinguish between the SoCs and two boolean within the setup structure in order to know if the platform has a single/multiple IT lines and if the FMP bit within CR1 is available or not. Signed-off-by: Valentin Caron Signed-off-by: Alain Volmat Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-stm32f7.c | 214 ++++++++++++++++++------------- 1 file changed, 126 insertions(+), 88 deletions(-) diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index f7abf6d3a7b96..01210452216b3 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -50,6 +50,7 @@ #define STM32F7_I2C_TXDR 0x28 /* STM32F7 I2C control 1 */ +#define STM32_I2C_CR1_FMP BIT(24) #define STM32F7_I2C_CR1_PECEN BIT(23) #define STM32F7_I2C_CR1_ALERTEN BIT(22) #define STM32F7_I2C_CR1_SMBHEN BIT(20) @@ -226,6 +227,8 @@ struct stm32f7_i2c_spec { * @rise_time: Rise time (ns) * @fall_time: Fall time (ns) * @fmp_clr_offset: Fast Mode Plus clear register offset from set register + * @single_it_line: Only a single IT line is used for both events/errors + * @fmp_cr1_bit: Fast Mode Plus control is done via a bit in CR1 */ struct stm32f7_i2c_setup { u32 speed_freq; @@ -233,6 +236,8 @@ struct stm32f7_i2c_setup { u32 rise_time; u32 fall_time; u32 fmp_clr_offset; + bool single_it_line; + bool fmp_cr1_bit; }; /** @@ -418,6 +423,13 @@ static const struct stm32f7_i2c_setup stm32mp13_setup = { .fmp_clr_offset = 0x4, }; +static const struct stm32f7_i2c_setup stm32mp25_setup = { + .rise_time = STM32F7_I2C_RISE_TIME_DEFAULT, + .fall_time = STM32F7_I2C_FALL_TIME_DEFAULT, + .single_it_line = true, + .fmp_cr1_bit = true, +}; + static inline void stm32f7_i2c_set_bits(void __iomem *reg, u32 mask) { writel_relaxed(readl_relaxed(reg) | mask, reg); @@ -1492,17 +1504,81 @@ static irqreturn_t stm32f7_i2c_slave_isr_event(struct stm32f7_i2c_dev *i2c_dev, return IRQ_HANDLED; } +static irqreturn_t stm32f7_i2c_handle_isr_errs(struct stm32f7_i2c_dev *i2c_dev, u32 status) +{ + struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg; + u16 addr = f7_msg->addr; + void __iomem *base = i2c_dev->base; + struct device *dev = i2c_dev->dev; + struct stm32_i2c_dma *dma = i2c_dev->dma; + + /* Bus error */ + if (status & STM32F7_I2C_ISR_BERR) { + dev_err(dev, "Bus error accessing addr 0x%x\n", addr); + writel_relaxed(STM32F7_I2C_ICR_BERRCF, base + STM32F7_I2C_ICR); + stm32f7_i2c_release_bus(&i2c_dev->adap); + f7_msg->result = -EIO; + } + + /* Arbitration loss */ + if (status & STM32F7_I2C_ISR_ARLO) { + dev_dbg(dev, "Arbitration loss accessing addr 0x%x\n", addr); + writel_relaxed(STM32F7_I2C_ICR_ARLOCF, base + STM32F7_I2C_ICR); + f7_msg->result = -EAGAIN; + } + + if (status & STM32F7_I2C_ISR_PECERR) { + dev_err(dev, "PEC error in reception accessing addr 0x%x\n", addr); + writel_relaxed(STM32F7_I2C_ICR_PECCF, base + STM32F7_I2C_ICR); + f7_msg->result = -EINVAL; + } + + if (status & STM32F7_I2C_ISR_ALERT) { + dev_dbg(dev, "SMBus alert received\n"); + writel_relaxed(STM32F7_I2C_ICR_ALERTCF, base + STM32F7_I2C_ICR); + i2c_handle_smbus_alert(i2c_dev->alert->ara); + return IRQ_HANDLED; + } + + if (!i2c_dev->slave_running) { + u32 mask; + /* Disable interrupts */ + if (stm32f7_i2c_is_slave_registered(i2c_dev)) + mask = STM32F7_I2C_XFER_IRQ_MASK; + else + mask = STM32F7_I2C_ALL_IRQ_MASK; + stm32f7_i2c_disable_irq(i2c_dev, mask); + } + + /* Disable dma */ + if (i2c_dev->use_dma) { + stm32f7_i2c_disable_dma_req(i2c_dev); + dmaengine_terminate_async(dma->chan_using); + } + + i2c_dev->master_mode = false; + complete(&i2c_dev->complete); + + return IRQ_HANDLED; +} + +#define STM32F7_ERR_EVENTS (STM32F7_I2C_ISR_BERR | STM32F7_I2C_ISR_ARLO |\ + STM32F7_I2C_ISR_PECERR | STM32F7_I2C_ISR_ALERT) static irqreturn_t stm32f7_i2c_isr_event(int irq, void *data) { struct stm32f7_i2c_dev *i2c_dev = data; u32 status; - /* Check if the interrupt is for a slave device */ - if (!i2c_dev->master_mode) - return IRQ_WAKE_THREAD; - status = readl_relaxed(i2c_dev->base + STM32F7_I2C_ISR); + /* + * Check if the interrupt is for a slave device or related + * to errors flags (in case of single it line mode) + */ + if (!i2c_dev->master_mode || + (i2c_dev->setup.single_it_line && (status & STM32F7_ERR_EVENTS))) + return IRQ_WAKE_THREAD; + /* Tx empty */ if (status & STM32F7_I2C_ISR_TXIS) stm32f7_i2c_write_tx_data(i2c_dev); @@ -1534,6 +1610,10 @@ static irqreturn_t stm32f7_i2c_isr_event_thread(int irq, void *data) if (!i2c_dev->master_mode) return stm32f7_i2c_slave_isr_event(i2c_dev, status); + /* Handle errors in case of this handler is used for events/errors */ + if (i2c_dev->setup.single_it_line && (status & STM32F7_ERR_EVENTS)) + return stm32f7_i2c_handle_isr_errs(i2c_dev, status); + /* NACK received */ if (status & STM32F7_I2C_ISR_NACKF) { dev_dbg(i2c_dev->dev, "<%s>: Receive NACK (addr %x)\n", @@ -1599,63 +1679,11 @@ static irqreturn_t stm32f7_i2c_isr_event_thread(int irq, void *data) static irqreturn_t stm32f7_i2c_isr_error_thread(int irq, void *data) { struct stm32f7_i2c_dev *i2c_dev = data; - struct stm32f7_i2c_msg *f7_msg = &i2c_dev->f7_msg; - u16 addr = f7_msg->addr; - void __iomem *base = i2c_dev->base; - struct device *dev = i2c_dev->dev; - struct stm32_i2c_dma *dma = i2c_dev->dma; u32 status; status = readl_relaxed(i2c_dev->base + STM32F7_I2C_ISR); - /* Bus error */ - if (status & STM32F7_I2C_ISR_BERR) { - dev_err(dev, "Bus error accessing addr 0x%x\n", addr); - writel_relaxed(STM32F7_I2C_ICR_BERRCF, base + STM32F7_I2C_ICR); - stm32f7_i2c_release_bus(&i2c_dev->adap); - f7_msg->result = -EIO; - } - - /* Arbitration loss */ - if (status & STM32F7_I2C_ISR_ARLO) { - dev_dbg(dev, "Arbitration loss accessing addr 0x%x\n", addr); - writel_relaxed(STM32F7_I2C_ICR_ARLOCF, base + STM32F7_I2C_ICR); - f7_msg->result = -EAGAIN; - } - - if (status & STM32F7_I2C_ISR_PECERR) { - dev_err(dev, "PEC error in reception accessing addr 0x%x\n", addr); - writel_relaxed(STM32F7_I2C_ICR_PECCF, base + STM32F7_I2C_ICR); - f7_msg->result = -EINVAL; - } - - if (status & STM32F7_I2C_ISR_ALERT) { - dev_dbg(dev, "SMBus alert received\n"); - writel_relaxed(STM32F7_I2C_ICR_ALERTCF, base + STM32F7_I2C_ICR); - i2c_handle_smbus_alert(i2c_dev->alert->ara); - return IRQ_HANDLED; - } - - if (!i2c_dev->slave_running) { - u32 mask; - /* Disable interrupts */ - if (stm32f7_i2c_is_slave_registered(i2c_dev)) - mask = STM32F7_I2C_XFER_IRQ_MASK; - else - mask = STM32F7_I2C_ALL_IRQ_MASK; - stm32f7_i2c_disable_irq(i2c_dev, mask); - } - - /* Disable dma */ - if (i2c_dev->use_dma) { - stm32f7_i2c_disable_dma_req(i2c_dev); - dmaengine_terminate_async(dma->chan_using); - } - - i2c_dev->master_mode = false; - complete(&i2c_dev->complete); - - return IRQ_HANDLED; + return stm32f7_i2c_handle_isr_errs(i2c_dev, status); } static int stm32f7_i2c_wait_polling(struct stm32f7_i2c_dev *i2c_dev) @@ -1991,23 +2019,27 @@ static int stm32f7_i2c_unreg_slave(struct i2c_client *slave) static int stm32f7_i2c_write_fm_plus_bits(struct stm32f7_i2c_dev *i2c_dev, bool enable) { - int ret; + int ret = 0; if (i2c_dev->bus_rate <= I2C_MAX_FAST_MODE_FREQ || - IS_ERR_OR_NULL(i2c_dev->regmap)) + (!i2c_dev->setup.fmp_cr1_bit && IS_ERR_OR_NULL(i2c_dev->regmap))) /* Optional */ return 0; - if (i2c_dev->fmp_sreg == i2c_dev->fmp_creg) - ret = regmap_update_bits(i2c_dev->regmap, - i2c_dev->fmp_sreg, - i2c_dev->fmp_mask, - enable ? i2c_dev->fmp_mask : 0); - else - ret = regmap_write(i2c_dev->regmap, - enable ? i2c_dev->fmp_sreg : - i2c_dev->fmp_creg, - i2c_dev->fmp_mask); + if (i2c_dev->setup.fmp_cr1_bit) { + if (enable) + stm32f7_i2c_set_bits(i2c_dev->base + STM32F7_I2C_CR1, STM32_I2C_CR1_FMP); + else + stm32f7_i2c_clr_bits(i2c_dev->base + STM32F7_I2C_CR1, STM32_I2C_CR1_FMP); + } else { + if (i2c_dev->fmp_sreg == i2c_dev->fmp_creg) + ret = regmap_update_bits(i2c_dev->regmap, i2c_dev->fmp_sreg, + i2c_dev->fmp_mask, enable ? i2c_dev->fmp_mask : 0); + else + ret = regmap_write(i2c_dev->regmap, + enable ? i2c_dev->fmp_sreg : i2c_dev->fmp_creg, + i2c_dev->fmp_mask); + } return ret; } @@ -2141,6 +2173,13 @@ static int stm32f7_i2c_probe(struct platform_device *pdev) if (!i2c_dev) return -ENOMEM; + setup = of_device_get_match_data(&pdev->dev); + if (!setup) { + dev_err(&pdev->dev, "Can't get device data\n"); + return -ENODEV; + } + i2c_dev->setup = *setup; + i2c_dev->base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(i2c_dev->base)) return PTR_ERR(i2c_dev->base); @@ -2150,10 +2189,6 @@ static int stm32f7_i2c_probe(struct platform_device *pdev) if (irq_event < 0) return irq_event; - irq_error = platform_get_irq(pdev, 1); - if (irq_error < 0) - return irq_error; - i2c_dev->wakeup_src = of_property_read_bool(pdev->dev.of_node, "wakeup-source"); @@ -2181,20 +2216,19 @@ static int stm32f7_i2c_probe(struct platform_device *pdev) if (ret) return dev_err_probe(&pdev->dev, ret, "Failed to request irq event\n"); - ret = devm_request_threaded_irq(&pdev->dev, irq_error, - NULL, - stm32f7_i2c_isr_error_thread, - IRQF_ONESHOT, - pdev->name, i2c_dev); - if (ret) - return dev_err_probe(&pdev->dev, ret, "Failed to request irq error\n"); + if (!i2c_dev->setup.single_it_line) { + irq_error = platform_get_irq(pdev, 1); + if (irq_error < 0) + return irq_error; - setup = of_device_get_match_data(&pdev->dev); - if (!setup) { - dev_err(&pdev->dev, "Can't get device data\n"); - return -ENODEV; + ret = devm_request_threaded_irq(&pdev->dev, irq_error, + NULL, + stm32f7_i2c_isr_error_thread, + IRQF_ONESHOT, + pdev->name, i2c_dev); + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to request irq error\n"); } - i2c_dev->setup = *setup; ret = stm32f7_i2c_setup_timing(i2c_dev, &i2c_dev->setup); if (ret) @@ -2202,9 +2236,12 @@ static int stm32f7_i2c_probe(struct platform_device *pdev) /* Setup Fast mode plus if necessary */ if (i2c_dev->bus_rate > I2C_MAX_FAST_MODE_FREQ) { - ret = stm32f7_i2c_setup_fm_plus_bits(pdev, i2c_dev); - if (ret) - return ret; + if (!i2c_dev->setup.fmp_cr1_bit) { + ret = stm32f7_i2c_setup_fm_plus_bits(pdev, i2c_dev); + if (ret) + return ret; + } + ret = stm32f7_i2c_write_fm_plus_bits(i2c_dev, true); if (ret) return ret; @@ -2483,6 +2520,7 @@ static const struct of_device_id stm32f7_i2c_match[] = { { .compatible = "st,stm32f7-i2c", .data = &stm32f7_setup}, { .compatible = "st,stm32mp15-i2c", .data = &stm32mp15_setup}, { .compatible = "st,stm32mp13-i2c", .data = &stm32mp13_setup}, + { .compatible = "st,stm32mp25-i2c", .data = &stm32mp25_setup}, {}, }; MODULE_DEVICE_TABLE(of, stm32f7_i2c_match); -- GitLab From 4503538d3066f6dd0a66ecc902b382912b97d8a1 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 13 Jan 2024 20:35:54 +0100 Subject: [PATCH 1156/1290] MAINTAINERS: use proper email for my I2C work Renesas is solely funding my I2C maintenance meanwhile, give them credit for that by using this email address. Signed-off-by: Wolfram Sang --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 012df8ccf34e9..1a1824790239e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9988,7 +9988,7 @@ F: Documentation/i2c/busses/i2c-parport.rst F: drivers/i2c/busses/i2c-parport.c I2C SUBSYSTEM -M: Wolfram Sang +M: Wolfram Sang L: linux-i2c@vger.kernel.org S: Maintained W: https://i2c.wiki.kernel.org/ -- GitLab From e28b0359587fe4055c838698172de0530b511702 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Jan 2024 15:54:41 -0800 Subject: [PATCH 1157/1290] bcachefs: Replace strlcpy() with strscpy() strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated[1]. Additionally, it returns the size of the source string, not the resulting size of the destination string. In an effort to remove strlcpy() completely[2], replace strlcpy() here with strscpy(). Nothing checks the return value here, so a direct replacement with strspy() is possible. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [1] Link: https://github.com/KSPP/linux/issues/89 [2] Cc: Kent Overstreet Cc: Brian Foster Cc: Link: https://lore.kernel.org/r/20240110235438.work.385-kees@kernel.org Signed-off-by: Kees Cook --- fs/bcachefs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9dbc35940197f..cefe52898e8eb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1386,8 +1386,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) prt_bdevname(&name, ca->disk_sb.bdev); if (c->sb.nr_devices == 1) - strlcpy(c->name, name.buf, sizeof(c->name)); - strlcpy(ca->name, name.buf, sizeof(ca->name)); + strscpy(c->name, name.buf, sizeof(c->name)); + strscpy(ca->name, name.buf, sizeof(ca->name)); printbuf_exit(&name); -- GitLab From 3bb9b1f958c3d986ed90a3ff009f1e77e9553207 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Wed, 10 Jan 2024 20:58:35 +0530 Subject: [PATCH 1158/1290] drm/amd/display: Fix late derefrence 'dsc' check in 'link_set_dsc_pps_packet()' In link_set_dsc_pps_packet(), 'struct display_stream_compressor *dsc' was dereferenced in a DC_LOGGER_INIT(dsc->ctx->logger); before the 'dsc' NULL pointer check. Fixes the below: drivers/gpu/drm/amd/amdgpu/../display/dc/link/link_dpms.c:905 link_set_dsc_pps_packet() warn: variable dereferenced before check 'dsc' (see line 903) Cc: stable@vger.kernel.org Cc: Aurabindo Pillai Cc: Rodrigo Siqueira Cc: Hamza Mahfooz Cc: Wenjing Liu Cc: Qingqing Zhuo Signed-off-by: Srinivasan Shanmugam Reviewed-by: Aurabindo Pillai Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/link/link_dpms.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/link/link_dpms.c b/drivers/gpu/drm/amd/display/dc/link/link_dpms.c index 3de148004c066..3cbfbf8d107e9 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_dpms.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_dpms.c @@ -900,11 +900,15 @@ bool link_set_dsc_pps_packet(struct pipe_ctx *pipe_ctx, bool enable, bool immedi { struct display_stream_compressor *dsc = pipe_ctx->stream_res.dsc; struct dc_stream_state *stream = pipe_ctx->stream; - DC_LOGGER_INIT(dsc->ctx->logger); - if (!pipe_ctx->stream->timing.flags.DSC || !dsc) + if (!pipe_ctx->stream->timing.flags.DSC) return false; + if (!dsc) + return false; + + DC_LOGGER_INIT(dsc->ctx->logger); + if (enable) { struct dsc_config dsc_cfg; uint8_t dsc_packed_pps[128]; -- GitLab From aa36d8971fccb55ef3241cbfff9d1799e31d8628 Mon Sep 17 00:00:00 2001 From: Dillon Varone Date: Thu, 28 Dec 2023 21:36:39 -0500 Subject: [PATCH 1159/1290] drm/amd/display: Init link enc resources in dc_state only if res_pool presents [Why & How] res_pool is not initialized in all situations such as virtual environments, and therefore link encoder resources should not be initialized if res_pool is NULL. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Martin Leung Acked-by: Alex Hung Signed-off-by: Dillon Varone Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_state.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c b/drivers/gpu/drm/amd/display/dc/core/dc_state.c index 460a8010c79fe..56feee0ff01b1 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c @@ -267,7 +267,8 @@ void dc_state_construct(struct dc *dc, struct dc_state *state) state->clk_mgr = dc->clk_mgr; /* Initialise DIG link encoder resource tracking variables. */ - link_enc_cfg_init(dc, state); + if (dc->res_pool) + link_enc_cfg_init(dc, state); } void dc_state_destruct(struct dc_state *state) -- GitLab From 8a51cc097dd590a86e8eec5398934ef389ff9a7b Mon Sep 17 00:00:00 2001 From: Charlene Liu Date: Thu, 28 Dec 2023 13:19:33 -0500 Subject: [PATCH 1160/1290] drm/amd/display: Add logging resource checks [Why] When mapping resources, resources could be unavailable. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Sung joon Kim Acked-by: Alex Hung Signed-off-by: Charlene Liu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 4 +++- drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 4 ++++ drivers/gpu/drm/amd/display/dc/core/dc_state.c | 5 +++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 69e726630241d..aa7c02ba948e9 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3522,7 +3522,7 @@ static void commit_planes_for_stream(struct dc *dc, top_pipe_to_program = resource_get_otg_master_for_stream( &context->res_ctx, stream); - + ASSERT(top_pipe_to_program != NULL); for (i = 0; i < dc->res_pool->pipe_count; i++) { struct pipe_ctx *old_pipe = &dc->current_state->res_ctx.pipe_ctx[i]; @@ -4345,6 +4345,8 @@ static bool should_commit_minimal_transition_for_windowed_mpo_odm(struct dc *dc, cur_pipe = resource_get_otg_master_for_stream(&dc->current_state->res_ctx, stream); new_pipe = resource_get_otg_master_for_stream(&context->res_ctx, stream); + if (!cur_pipe || !new_pipe) + return false; cur_is_odm_in_use = resource_get_odm_slice_count(cur_pipe) > 1; new_is_odm_in_use = resource_get_odm_slice_count(new_pipe) > 1; if (cur_is_odm_in_use == new_is_odm_in_use) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index f2abc1096ffb6..9fbdb09697fd5 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -2194,6 +2194,10 @@ void resource_log_pipe_topology_update(struct dc *dc, struct dc_state *state) for (stream_idx = 0; stream_idx < state->stream_count; stream_idx++) { otg_master = resource_get_otg_master_for_stream( &state->res_ctx, state->streams[stream_idx]); + if (!otg_master || otg_master->stream_res.tg == NULL) { + DC_LOG_DC("topology update: otg_master NULL stream_idx %d!\n", stream_idx); + return; + } slice_count = resource_get_opp_heads_for_otg_master(otg_master, &state->res_ctx, opp_heads); for (slice_idx = 0; slice_idx < slice_count; slice_idx++) { diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c b/drivers/gpu/drm/amd/display/dc/core/dc_state.c index 56feee0ff01b1..88c6436b28b69 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c @@ -434,8 +434,9 @@ bool dc_state_add_plane( otg_master_pipe = resource_get_otg_master_for_stream( &state->res_ctx, stream); - added = resource_append_dpp_pipes_for_plane_composition(state, - dc->current_state, pool, otg_master_pipe, plane_state); + if (otg_master_pipe) + added = resource_append_dpp_pipes_for_plane_composition(state, + dc->current_state, pool, otg_master_pipe, plane_state); if (added) { stream_status->plane_states[stream_status->plane_count] = -- GitLab From 4b56f7d47be87cde5f368b67bc7fac53a2c3e8d2 Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Fri, 15 Dec 2023 11:01:42 -0500 Subject: [PATCH 1161/1290] drm/amd/display: Port DENTIST hang and TDR fixes to OTG disable W/A [Why] We can experience DENTIST hangs during optimize_bandwidth or TDRs if FIFO is toggled and hangs. [How] Port the DCN35 fixes to DCN314. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Charlene Liu Acked-by: Alex Hung Signed-off-by: Nicholas Kazlauskas Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../dc/clk_mgr/dcn314/dcn314_clk_mgr.c | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c index 878c0e7b78abd..a84f1e376dee4 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c @@ -145,30 +145,27 @@ static int dcn314_get_active_display_cnt_wa( return display_count; } -static void dcn314_disable_otg_wa(struct clk_mgr *clk_mgr_base, struct dc_state *context, bool disable) +static void dcn314_disable_otg_wa(struct clk_mgr *clk_mgr_base, struct dc_state *context, + bool safe_to_lower, bool disable) { struct dc *dc = clk_mgr_base->ctx->dc; int i; for (i = 0; i < dc->res_pool->pipe_count; ++i) { - struct pipe_ctx *pipe = &dc->current_state->res_ctx.pipe_ctx[i]; + struct pipe_ctx *pipe = safe_to_lower + ? &context->res_ctx.pipe_ctx[i] + : &dc->current_state->res_ctx.pipe_ctx[i]; if (pipe->top_pipe || pipe->prev_odm_pipe) continue; if (pipe->stream && (pipe->stream->dpms_off || dc_is_virtual_signal(pipe->stream->signal))) { - struct stream_encoder *stream_enc = pipe->stream_res.stream_enc; - if (disable) { - if (stream_enc && stream_enc->funcs->disable_fifo) - pipe->stream_res.stream_enc->funcs->disable_fifo(stream_enc); + if (pipe->stream_res.tg && pipe->stream_res.tg->funcs->immediate_disable_crtc) + pipe->stream_res.tg->funcs->immediate_disable_crtc(pipe->stream_res.tg); - pipe->stream_res.tg->funcs->immediate_disable_crtc(pipe->stream_res.tg); reset_sync_context_for_pipe(dc, context, i); } else { pipe->stream_res.tg->funcs->enable_crtc(pipe->stream_res.tg); - - if (stream_enc && stream_enc->funcs->enable_fifo) - pipe->stream_res.stream_enc->funcs->enable_fifo(stream_enc); } } } @@ -297,11 +294,11 @@ void dcn314_update_clocks(struct clk_mgr *clk_mgr_base, } if (should_set_clock(safe_to_lower, new_clocks->dispclk_khz, clk_mgr_base->clks.dispclk_khz)) { - dcn314_disable_otg_wa(clk_mgr_base, context, true); + dcn314_disable_otg_wa(clk_mgr_base, context, safe_to_lower, true); clk_mgr_base->clks.dispclk_khz = new_clocks->dispclk_khz; dcn314_smu_set_dispclk(clk_mgr, clk_mgr_base->clks.dispclk_khz); - dcn314_disable_otg_wa(clk_mgr_base, context, false); + dcn314_disable_otg_wa(clk_mgr_base, context, safe_to_lower, false); update_dispclk = true; } -- GitLab From 3ba2a0bfd8cf94eb225e1c60dff16e5c35bde1da Mon Sep 17 00:00:00 2001 From: Ilya Bakoulin Date: Wed, 3 Jan 2024 09:42:04 -0500 Subject: [PATCH 1162/1290] drm/amd/display: Clear OPTC mem select on disable [Why] Not clearing the memory select bits prior to OPTC disable can cause DSC corruption issues when attempting to reuse a memory instance for another OPTC that enables ODM. [How] Clear the memory select bits prior to disabling an OPTC. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Charlene Liu Acked-by: Alex Hung Signed-off-by: Ilya Bakoulin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c | 3 +++ drivers/gpu/drm/amd/display/dc/optc/dcn35/dcn35_optc.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c index 1788eb29474b4..8234935433254 100644 --- a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c +++ b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c @@ -173,6 +173,9 @@ static bool optc32_disable_crtc(struct timing_generator *optc) OPTC_SEG3_SRC_SEL, 0xf, OPTC_NUM_OF_INPUT_SEGMENT, 0); + REG_UPDATE(OPTC_MEMORY_CONFIG, + OPTC_MEM_SEL, 0); + /* disable otg request until end of the first line * in the vertical blank region */ diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn35/dcn35_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn35/dcn35_optc.c index 3d6c1b2c2b4d6..5b15475088503 100644 --- a/drivers/gpu/drm/amd/display/dc/optc/dcn35/dcn35_optc.c +++ b/drivers/gpu/drm/amd/display/dc/optc/dcn35/dcn35_optc.c @@ -145,6 +145,9 @@ static bool optc35_disable_crtc(struct timing_generator *optc) OPTC_SEG3_SRC_SEL, 0xf, OPTC_NUM_OF_INPUT_SEGMENT, 0); + REG_UPDATE(OPTC_MEMORY_CONFIG, + OPTC_MEM_SEL, 0); + /* disable otg request until end of the first line * in the vertical blank region */ -- GitLab From d3579f5df0536c2f0fabaa3ea80bb2d179884195 Mon Sep 17 00:00:00 2001 From: Ovidiu Bunea Date: Mon, 18 Dec 2023 21:40:45 -0500 Subject: [PATCH 1163/1290] drm/amd/display: Fix DML2 watermark calculation [Why] core_mode_programming in DML2 should output watermark calculations to locals, but it incorrectly uses mode_lib [How] update code to match HW DML2 Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Charlene Liu Acked-by: Alex Hung Signed-off-by: Ovidiu Bunea Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../drm/amd/display/dc/dml2/display_mode_core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c b/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c index a6b938a12de13..9be5ebf3a8c0b 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c @@ -9446,13 +9446,13 @@ void dml_core_mode_programming(struct display_mode_lib_st *mode_lib, const struc CalculateWatermarks_params->CompressedBufferSizeInkByte = locals->CompressedBufferSizeInkByte; // Output - CalculateWatermarks_params->Watermark = &s->dummy_watermark; // Watermarks *Watermark - CalculateWatermarks_params->DRAMClockChangeSupport = &mode_lib->ms.support.DRAMClockChangeSupport[0]; - CalculateWatermarks_params->MaxActiveDRAMClockChangeLatencySupported = &s->dummy_single_array[0][0]; // dml_float_t *MaxActiveDRAMClockChangeLatencySupported[] - CalculateWatermarks_params->SubViewportLinesNeededInMALL = &mode_lib->ms.SubViewportLinesNeededInMALL[j]; // dml_uint_t SubViewportLinesNeededInMALL[] - CalculateWatermarks_params->FCLKChangeSupport = &mode_lib->ms.support.FCLKChangeSupport[0]; - CalculateWatermarks_params->MaxActiveFCLKChangeLatencySupported = &s->dummy_single[0]; // dml_float_t *MaxActiveFCLKChangeLatencySupported - CalculateWatermarks_params->USRRetrainingSupport = &mode_lib->ms.support.USRRetrainingSupport[0]; + CalculateWatermarks_params->Watermark = &locals->Watermark; // Watermarks *Watermark + CalculateWatermarks_params->DRAMClockChangeSupport = &locals->DRAMClockChangeSupport; + CalculateWatermarks_params->MaxActiveDRAMClockChangeLatencySupported = locals->MaxActiveDRAMClockChangeLatencySupported; // dml_float_t *MaxActiveDRAMClockChangeLatencySupported[] + CalculateWatermarks_params->SubViewportLinesNeededInMALL = locals->SubViewportLinesNeededInMALL; // dml_uint_t SubViewportLinesNeededInMALL[] + CalculateWatermarks_params->FCLKChangeSupport = &locals->FCLKChangeSupport; + CalculateWatermarks_params->MaxActiveFCLKChangeLatencySupported = &locals->MaxActiveFCLKChangeLatencySupported; // dml_float_t *MaxActiveFCLKChangeLatencySupported + CalculateWatermarks_params->USRRetrainingSupport = &locals->USRRetrainingSupport; CalculateWatermarksMALLUseAndDRAMSpeedChangeSupport( &mode_lib->scratch, -- GitLab From bfe79f5fff1300d96203383582b078c7b0aec80a Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Tue, 2 Jan 2024 14:20:37 +0800 Subject: [PATCH 1164/1290] drm/amd/display: Align the returned error code with legacy DP [Why] For usb4 connector, AUX transaction is handled by dmub utilizing a differnt code path comparing to legacy DP connector. If the usb4 DP connector is disconnected, AUX access will report EBUSY and cause igt@kms_dp_aux_dev fail. [How] Align the error code with the one reported by legacy DP as EIO. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Acked-by: Alex Hung Signed-off-by: Wayne Lin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c index eaf8d9f482446..85b7f58a7f35a 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c @@ -979,6 +979,11 @@ int dm_helper_dmub_aux_transfer_sync( struct aux_payload *payload, enum aux_return_code_type *operation_result) { + if (!link->hpd_status) { + *operation_result = AUX_RET_ERROR_HPD_DISCON; + return -1; + } + return amdgpu_dm_process_dmub_aux_transfer_sync(ctx, link->link_index, payload, operation_result); } -- GitLab From bc03c02cc1991a066b23e69bbcc0f66e8f1f7453 Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Fri, 12 Jan 2024 13:33:24 +0800 Subject: [PATCH 1165/1290] drm/amdgpu: Fix the null pointer when load rlc firmware If the RLC firmware is invalid because of wrong header size, the pointer to the rlc firmware is released in function amdgpu_ucode_request. There will be a null pointer error in subsequent use. So skip validation to fix it. Fixes: 3da9b71563cb ("drm/amd: Use `amdgpu_ucode_*` helpers for GFX10") Signed-off-by: Ma Jun Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 73f6d7e72c737..d63cab294883b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -3996,16 +3996,13 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) if (!amdgpu_sriov_vf(adev)) { snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_rlc.bin", ucode_prefix); - err = amdgpu_ucode_request(adev, &adev->gfx.rlc_fw, fw_name); - /* don't check this. There are apparently firmwares in the wild with - * incorrect size in the header - */ - if (err == -ENODEV) - goto out; + err = request_firmware(&adev->gfx.rlc_fw, fw_name, adev->dev); if (err) - dev_dbg(adev->dev, - "gfx10: amdgpu_ucode_request() failed \"%s\"\n", - fw_name); + goto out; + + /* don't validate this firmware. There are apparently firmwares + * in the wild with incorrect size in the header + */ rlc_hdr = (const struct rlc_firmware_header_v2_0 *)adev->gfx.rlc_fw->data; version_major = le16_to_cpu(rlc_hdr->header.header_version_major); version_minor = le16_to_cpu(rlc_hdr->header.header_version_minor); -- GitLab From 05638ff6dd6f0f38734b6b3ee2c7cf15520f5c00 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 13 Jan 2024 15:58:21 +0100 Subject: [PATCH 1166/1290] drm/amd/display: Fix a switch statement in populate_dml_output_cfg_from_stream_state() It is likely that the statement related to 'dml_edp' is misplaced. So move it in the correct "case SIGNAL_TYPE_EDP". Fixes: 7966f319c66d ("drm/amd/display: Introduce DML2") Signed-off-by: Christophe JAILLET Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c index fa6a93dd96295..64d01a9cd68c8 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c @@ -626,8 +626,8 @@ static void populate_dml_output_cfg_from_stream_state(struct dml_output_cfg_st * if (is_dp2p0_output_encoder(pipe)) out->OutputEncoder[location] = dml_dp2p0; break; - out->OutputEncoder[location] = dml_edp; case SIGNAL_TYPE_EDP: + out->OutputEncoder[location] = dml_edp; break; case SIGNAL_TYPE_HDMI_TYPE_A: case SIGNAL_TYPE_DVI_SINGLE_LINK: -- GitLab From 3c4e4eb5d872118fef1708abe933a410c5e07e3a Mon Sep 17 00:00:00 2001 From: Flora Cui Date: Wed, 10 Jan 2024 19:23:56 +0800 Subject: [PATCH 1167/1290] drm/amdkfd: init drm_client with funcs hook otherwise drm_client_dev_unregister() would try to kfree(&adev->kfd.client). Fixes: 1819200166ce ("drm/amdkfd: Export DMABufs from KFD using GEM handles") Signed-off-by: Flora Cui Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 067690ba7bffd..81af6bf2f0525 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -138,6 +138,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work) amdgpu_device_gpu_recover(adev, NULL, &reset_context); } +static const struct drm_client_funcs kfd_client_funcs = { + .unregister = drm_client_release, +}; void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) { int i; @@ -161,7 +164,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) .enable_mes = adev->enable_mes, }; - ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", NULL); + ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", &kfd_client_funcs); if (ret) { dev_err(adev->dev, "Failed to init DRM client: %d\n", ret); return; -- GitLab From fb1c93c2e9604a884467a773790016199f78ca08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 10 Jan 2024 15:19:29 +0100 Subject: [PATCH 1168/1290] drm/amdgpu: revert "Adjust removal control flow for smu v13_0_2" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling amdgpu_device_ip_resume_phase1() during shutdown leaves the HW in an active state and is an unbalanced use of the IP callbacks. Using the IP callbacks like this can lead to memory leaks, double free and imbalanced reference counters. Leaving the HW in an active state can lead to DMA accesses to memory now freed by the driver. Both is a complete no-go for driver unload so completely revert the workaround for now. This reverts commit f5c7e7797060255dbc8160734ccc5ad6183c5e04. Signed-off-by: Christian König Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 +--------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 32 ---------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 1 - 4 files changed, 1 insertion(+), 65 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ecbc58269951c..b158d27d0a71c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5246,7 +5246,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, struct amdgpu_device *tmp_adev = NULL; bool need_full_reset, skip_hw_reset, vram_lost = false; int r = 0; - bool gpu_reset_for_dev_remove = 0; /* Try reset handler method first */ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, @@ -5266,10 +5265,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); - gpu_reset_for_dev_remove = - test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && - test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); - /* * ASIC reset has to be done on all XGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) @@ -5312,18 +5307,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, amdgpu_ras_intr_cleared(); } - /* Since the mode1 reset affects base ip blocks, the - * phase1 ip blocks need to be resumed. Otherwise there - * will be a BIOS signature error and the psp bootloader - * can't load kdb on the next amdgpu install. - */ - if (gpu_reset_for_dev_remove) { - list_for_each_entry(tmp_adev, device_list_handle, reset_list) - amdgpu_device_ip_resume_phase1(tmp_adev); - - goto end; - } - list_for_each_entry(tmp_adev, device_list_handle, reset_list) { if (need_full_reset) { /* post card */ @@ -5560,11 +5543,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, int i, r = 0; bool need_emergency_restart = false; bool audio_suspended = false; - bool gpu_reset_for_dev_remove = false; - - gpu_reset_for_dev_remove = - test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && - test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); /* * Special case: RAS triggered and full reset isn't supported @@ -5602,7 +5580,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { list_add_tail(&tmp_adev->reset_list, &device_list); - if (gpu_reset_for_dev_remove && adev->shutdown) + if (adev->shutdown) tmp_adev->shutdown = true; } if (!list_is_first(&adev->reset_list, &device_list)) @@ -5687,10 +5665,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - if (gpu_reset_for_dev_remove) { - /* Workaroud for ASICs need to disable SMC first */ - amdgpu_device_smu_fini_early(tmp_adev); - } r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); /*TODO Should we stop ?*/ if (r) { @@ -5722,9 +5696,6 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ r = amdgpu_do_asic_reset(device_list_handle, reset_context); if (r && r == -EAGAIN) goto retry; - - if (!r && gpu_reset_for_dev_remove) - goto recover_end; } skip_hw_reset: @@ -5780,7 +5751,6 @@ skip_sched_resume: amdgpu_ras_set_error_query_ready(tmp_adev, true); } -recover_end: tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, reset_list); amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 5c9caf5fa0758..cc69005f5b46e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2337,38 +2337,6 @@ amdgpu_pci_remove(struct pci_dev *pdev) pm_runtime_forbid(dev->dev); } - if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) && - !amdgpu_sriov_vf(adev)) { - bool need_to_reset_gpu = false; - - if (adev->gmc.xgmi.num_physical_nodes > 1) { - struct amdgpu_hive_info *hive; - - hive = amdgpu_get_xgmi_hive(adev); - if (hive->device_remove_count == 0) - need_to_reset_gpu = true; - hive->device_remove_count++; - amdgpu_put_xgmi_hive(hive); - } else { - need_to_reset_gpu = true; - } - - /* Workaround for ASICs need to reset SMU. - * Called only when the first device is removed. - */ - if (need_to_reset_gpu) { - struct amdgpu_reset_context reset_context; - - adev->shutdown = true; - memset(&reset_context, 0, sizeof(reset_context)); - reset_context.method = AMD_RESET_METHOD_NONE; - reset_context.reset_req_dev = adev; - set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags); - amdgpu_device_gpu_recover(adev, NULL, &reset_context); - } - } - amdgpu_driver_unload_kms(dev); /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index b0335a1c5e90c..19899f6b9b2b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -32,7 +32,6 @@ enum AMDGPU_RESET_FLAGS { AMDGPU_NEED_FULL_RESET = 0, AMDGPU_SKIP_HW_RESET = 1, - AMDGPU_RESET_FOR_DEVICE_REMOVE = 2, }; struct amdgpu_reset_context { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 6cab882e8061e..1592c63b3099b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -43,7 +43,6 @@ struct amdgpu_hive_info { } pstate; struct amdgpu_reset_domain *reset_domain; - uint32_t device_remove_count; atomic_t ras_recovery; }; -- GitLab From b2139c96dc954b58b81bc670fc4ea5f034ed062c Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Sat, 13 Jan 2024 14:32:27 +0530 Subject: [PATCH 1169/1290] drm/amd/display: Drop 'acrtc' and add 'new_crtc_state' NULL check for writeback requests. Return value of 'to_amdgpu_crtc' which is container_of(...) can't be null, so it's null check 'acrtc' is dropped. Fixing the below: drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:9302 amdgpu_dm_atomic_commit_tail() error: we previously assumed 'acrtc' could be null (see line 9299) Added 'new_crtc_state' NULL check for function 'drm_atomic_get_new_crtc_state' that retrieves the new state for a CRTC, while enabling writeback requests. Cc: stable@vger.kernel.org Cc: Alex Hung Cc: Aurabindo Pillai Cc: Rodrigo Siqueira Cc: Hamza Mahfooz Signed-off-by: Srinivasan Shanmugam Reviewed-by: Alex Hung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 10b2a896c4982..d55eeb30ccb2c 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -9293,10 +9293,10 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) if (!new_con_state->writeback_job) continue; - new_crtc_state = NULL; + new_crtc_state = drm_atomic_get_new_crtc_state(state, &acrtc->base); - if (acrtc) - new_crtc_state = drm_atomic_get_new_crtc_state(state, &acrtc->base); + if (!new_crtc_state) + continue; if (acrtc->wb_enabled) continue; -- GitLab From aa0901a9008eeb2710292aff94e615adf7884d5f Mon Sep 17 00:00:00 2001 From: Ori Messinger Date: Wed, 22 Nov 2023 00:12:13 -0500 Subject: [PATCH 1170/1290] drm/amdgpu: Enable GFXOFF for Compute on GFX11 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On GFX version 11, GFXOFF was disabled due to a MES KIQ firmware issue, which has since been fixed after version 64. This patch only re-enables GFXOFF for GFX version 11 if the GPU's MES KIQ firmware version is newer than version 64. V2: Keep GFXOFF disabled on GFX11 if MES KIQ is below version 64. V3: Add parentheses to avoid GCC warning for parentheses: "suggest parentheses around comparison in operand of ‘&’" V4: Remove "V3" from commit title V5: Change commit description and insert 'Acked-by' Signed-off-by: Ori Messinger Acked-by: Alex Deucher Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 81af6bf2f0525..77e2636602887 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -698,10 +698,8 @@ err: void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle) { enum amd_powergating_state state = idle ? AMD_PG_STATE_GATE : AMD_PG_STATE_UNGATE; - /* Temporary workaround to fix issues observed in some - * compute applications when GFXOFF is enabled on GFX11. - */ - if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 11) { + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 11 && + ((adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK) <= 64)) { pr_debug("GFXOFF is %s\n", idle ? "enabled" : "disabled"); amdgpu_gfx_off_ctrl(adev, idle); } else if ((IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 9) && -- GitLab From 66f962d8939fd2ac74de901d30d30310c8ddca79 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 18 Jan 2024 22:21:20 +0100 Subject: [PATCH 1171/1290] riscv: Fix build error on rv32 + XIP commit 66f1e6809397 ("riscv: Make XIP bootable again") restricted page offset to the sv39 page offset instead of the default sv57, which makes sense since probably the platforms that target XIP kernels do not support anything else than sv39 and we do not try to find out the largest address space supported on XIP kernels (ie set_satp_mode()). But PAGE_OFFSET_L3 is not defined for rv32, so fix the build error by restoring the previous behaviour which picks CONFIG_PAGE_OFFSET for rv32. Fixes: 66f1e6809397 ("riscv: Make XIP bootable again") Reported-by: Randy Dunlap Closes: https://lore.kernel.org/linux-riscv/344dca85-5c48-44e1-bc64-4fa7973edd12@infradead.org/T/#u Signed-off-by: Alexandre Ghiti Acked-by: Randy Dunlap Tested-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/r/20240118212120.2087803-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index f533dd667a83a..32cad6a65ccd2 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1060,7 +1060,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) kernel_map.virt_addr = KERNEL_LINK_ADDR + kernel_map.virt_offset; #ifdef CONFIG_XIP_KERNEL +#ifdef CONFIG_64BIT kernel_map.page_offset = PAGE_OFFSET_L3; +#else + kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); +#endif kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; kernel_map.xiprom_sz = (uintptr_t)(&_exiprom) - (uintptr_t)(&_xiprom); -- GitLab From 2c25716dcc25a0420c4ad49d6e6bf61e60a21434 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 19:38:44 +1030 Subject: [PATCH 1172/1290] btrfs: zlib: fix and simplify the inline extent decompression [BUG] If we have a filesystem with 4k sectorsize, and an inlined compressed extent created like this: item 4 key (257 INODE_ITEM 0) itemoff 15863 itemsize 160 generation 8 transid 8 size 4096 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 5 key (257 INODE_REF 256) itemoff 15839 itemsize 24 index 2 namelen 14 name: source_inlined item 6 key (257 EXTENT_DATA 0) itemoff 15770 itemsize 69 generation 8 type 0 (inline) inline extent data size 48 ram_bytes 4096 compression 1 (zlib) Which has an inline compressed extent at file offset 0, and its decompressed size is 4K, allowing us to reflink that 4K range to another location (which will not be compressed). If we do such reflink on a subpage system, it would fail like this: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest XFS_IOC_CLONE_RANGE: Input/output error [CAUSE] In zlib_decompress(), we didn't treat @start_byte as just a page offset, but also use it as an indicator on whether we should switch our output buffer. In reality, for subpage cases, although @start_byte can be non-zero, we should never switch input/output buffer, since the whole input/output buffer should never exceed one sector. Note: The above assumption is only not true if we're going to support multi-page sectorsize. Thus the current code using @start_byte as a condition to switch input/output buffer or finish the decompression is completely incorrect. [FIX] The fix involves several modifications: - Rename @start_byte to @dest_pgoff to properly express its meaning - Add an extra ASSERT() inside btrfs_decompress() to make sure the input/output size never exceeds one sector. - Use Z_FINISH flag to make sure the decompression happens in one go - Remove the loop needed to switch input/output buffers - Use correct destination offset inside the destination page - Consider early end as an error After the fix, even on 64K page sized aarch64, above reflink now works as expected: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest linked 4096/4096 bytes at offset 61440 And resulted a correct file layout: item 9 key (258 INODE_ITEM 0) itemoff 15542 itemsize 160 generation 10 transid 10 size 65536 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 10 key (258 INODE_REF 256) itemoff 15528 itemsize 14 index 3 namelen 4 name: dest item 11 key (258 XATTR_ITEM 3817753667) itemoff 15445 itemsize 83 location key (0 UNKNOWN.0 0) type XATTR transid 10 data_len 37 name_len 16 name: security.selinux data unconfined_u:object_r:unlabeled_t:s0 item 12 key (258 EXTENT_DATA 61440) itemoff 15392 itemsize 53 generation 10 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 23 +++++++++---- fs/btrfs/compression.h | 2 +- fs/btrfs/zlib.c | 73 +++++++++++------------------------------- 3 files changed, 36 insertions(+), 62 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 193168214eeb1..68345f73d429a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -141,16 +141,16 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + const u8 *data_in, struct page *dest_page, + unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: /* @@ -1037,14 +1037,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * start_byte tells us the offset into the compressed data we're interested in */ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + unsigned long dest_pgoff, size_t srclen, size_t destlen) { + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); struct list_head *workspace; + const u32 sectorsize = fs_info->sectorsize; int ret; + /* + * The full destination page range should not exceed the page size. + * And the @destlen should not exceed sectorsize, as this is only called for + * inline file extents, which should not exceed sectorsize. + */ + ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); + workspace = get_workspace(type, 0); ret = compression_decompress(type, workspace, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); put_workspace(type, workspace); return ret; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 93cc92974deee..2b4dfb1b010cd 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -148,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 36cf1f0e338e2..8da66ea699e8f 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -354,18 +354,13 @@ done: } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - unsigned long bytes_left; - unsigned long total_out = 0; - unsigned long pg_offset = 0; - - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes_left = destlen; + unsigned long to_copy; workspace->strm.next_in = data_in; workspace->strm.avail_in = srclen; @@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, return -EIO; } - while (bytes_left > 0) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); - if (ret != Z_OK && ret != Z_STREAM_END) - break; - - buf_start = total_out; - total_out = workspace->strm.total_out; - - if (total_out == buf_start) { - ret = -EIO; - break; - } - - if (total_out <= start_byte) - goto next; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min(PAGE_SIZE - pg_offset, - PAGE_SIZE - (buf_offset % PAGE_SIZE)); - bytes = min(bytes, bytes_left); + /* + * Everything (in/out buf) should be at most one sector, there should + * be no need to switch any input/output buffer. + */ + ret = zlib_inflate(&workspace->strm, Z_FINISH); + to_copy = min(workspace->strm.total_out, destlen); + if (ret != Z_STREAM_END) + goto out; - memcpy_to_page(dest_page, pg_offset, - workspace->buf + buf_offset, bytes); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); - pg_offset += bytes; - bytes_left -= bytes; -next: - workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = workspace->buf_size; - } - - if (ret != Z_STREAM_END && bytes_left != 0) +out: + if (unlikely(to_copy != destlen)) { + pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n", + to_copy, destlen); ret = -EIO; - else + } else { ret = 0; + } zlib_inflateEnd(&workspace->strm); - /* - * this should only happen if zlib returned fewer bytes than we - * expected. btrfs_get_block is responsible for zeroing from the - * end of the inline extent (destlen) to the end of the page - */ - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); - } + if (unlikely(to_copy < destlen)) + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); return ret; } -- GitLab From 6a69631ec9b1b23784c012a855bbb23012a6dbeb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 19:38:45 +1030 Subject: [PATCH 1173/1290] btrfs: lzo: fix and simplify the inline extent decompression [BUG] If we have a filesystem with 4k sectorsize, and an inlined compressed extent created like this: item 4 key (257 INODE_ITEM 0) itemoff 15863 itemsize 160 generation 8 transid 8 size 4096 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 5 key (257 INODE_REF 256) itemoff 15839 itemsize 24 index 2 namelen 14 name: source_inlined item 6 key (257 EXTENT_DATA 0) itemoff 15770 itemsize 69 generation 8 type 0 (inline) inline extent data size 48 ram_bytes 4096 compression 2 (lzo) Then trying to reflink that extent in an aarch64 system with 64K page size, the reflink would just fail: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest XFS_IOC_CLONE_RANGE: Input/output error [CAUSE] In zlib_decompress(), we didn't treat @start_byte as just a page offset, but also use it as an indicator on whether we should error out, without any proper explanation (this is from the very beginning of btrfs). In reality, for subpage cases, although @start_byte can be non-zero, we should never switch input/output buffer nor error out, since the whole input/output buffer should never exceed one sector. Note: The above assumption is only not true if we're going to support multi-page sectorsize. Thus the current code using @start_byte as a condition to switch input/output buffer or finish the decompression is completely incorrect. [FIX] The fix involves several modifications: - Rename @start_byte to @dest_pgoff to properly express its meaning - Use @sectorsize other than PAGE_SIZE to properly initialize the output buffer size - Use correct destination offset inside the destination page - Use memcpy_to_page() to copy the contents to the destination page - Use memzero_page() to zero out the tailing part - Consider early end as an error After the fix, even on 64K page sized aarch64, above reflink now works as expected: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest linked 4096/4096 bytes at offset 61440 And results the correct file layout: item 9 key (258 INODE_ITEM 0) itemoff 15542 itemsize 160 generation 10 transid 10 size 65536 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 10 key (258 INODE_REF 256) itemoff 15528 itemsize 14 index 3 namelen 4 name: dest item 11 key (258 XATTR_ITEM 3817753667) itemoff 15445 itemsize 83 location key (0 UNKNOWN.0 0) type XATTR transid 10 data_len 37 name_len 16 name: security.selinux data unconfined_u:object_r:unlabeled_t:s0 item 12 key (258 EXTENT_DATA 61440) itemoff 15392 itemsize 53 generation 10 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 2 +- fs/btrfs/lzo.c | 34 +++++++++------------------------- 2 files changed, 10 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 2b4dfb1b010cd..afd7e50d073d4 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -159,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1131d5a29d612..e43bc0fdc74ec 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; - char *kaddr; - unsigned long bytes; if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) return -EUCLEAN; @@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, } data_in += LZO_LEN; - out_len = PAGE_SIZE; + out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { pr_warn("BTRFS: decompress failed!\n"); @@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, goto out; } - if (out_len < start_byte) { + ASSERT(out_len <= sectorsize); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + /* Early end, considered as an error. */ + if (unlikely(out_len < destlen)) { ret = -EIO; - goto out; + memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); } - - /* - * the caller is already checking against PAGE_SIZE, but lets - * move this check closer to the memcpy/memset - */ - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes = min_t(unsigned long, destlen, out_len - start_byte); - - kaddr = kmap_local_page(dest_page); - memcpy(kaddr, workspace->buf + start_byte, bytes); - - /* - * btrfs_getblock is doing a zero on the tail of the page too, - * but this will cover anything missing from the decompressed - * data. - */ - if (bytes < destlen) - memset(kaddr+bytes, 0, destlen-bytes); - kunmap_local(kaddr); out: return ret; } -- GitLab From 1e7f6def8b2370ecefb54b3c8f390ff894b0c51b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Jan 2024 19:38:46 +1030 Subject: [PATCH 1174/1290] btrfs: zstd: fix and simplify the inline extent decompression [BUG] If we have a filesystem with 4k sectorsize, and an inlined compressed extent created like this: item 4 key (257 INODE_ITEM 0) itemoff 15863 itemsize 160 generation 8 transid 8 size 4096 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 5 key (257 INODE_REF 256) itemoff 15839 itemsize 24 index 2 namelen 14 name: source_inlined item 6 key (257 EXTENT_DATA 0) itemoff 15770 itemsize 69 generation 8 type 0 (inline) inline extent data size 48 ram_bytes 4096 compression 3 (zstd) Then trying to reflink that extent in an aarch64 system with 64K page size, the reflink would just fail: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest XFS_IOC_CLONE_RANGE: Input/output error [CAUSE] In zstd_decompress(), we didn't treat @start_byte as just a page offset, but also use it as an indicator on whether we should error out, without any proper explanation (this is copied from other decompression code). In reality, for subpage cases, although @start_byte can be non-zero, we should never switch input/output buffer nor error out, since the whole input/output buffer should never exceed one sector, thus we should not need to do any buffer switch. Thus the current code using @start_byte as a condition to switch input/output buffer or finish the decompression is completely incorrect. [FIX] The fix involves several modification: - Rename @start_byte to @dest_pgoff to properly express its meaning - Use @sectorsize other than PAGE_SIZE to properly initialize the output buffer size - Use correct destination offset inside the destination page - Simplify the main loop Since the input/output buffer should never switch, we only need one zstd_decompress_stream() call. - Consider early end as an error After the fix, even on 64K page sized aarch64, above reflink now works as expected: # xfs_io -f -c "reflink $mnt/source_inlined 0 60k 4k" $mnt/dest linked 4096/4096 bytes at offset 61440 And results the correct file layout: item 9 key (258 INODE_ITEM 0) itemoff 15542 itemsize 160 generation 10 transid 10 size 65536 nbytes 4096 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 sequence 1 flags 0x0(none) item 10 key (258 INODE_REF 256) itemoff 15528 itemsize 14 index 3 namelen 4 name: dest item 11 key (258 XATTR_ITEM 3817753667) itemoff 15445 itemsize 83 location key (0 UNKNOWN.0 0) type XATTR transid 10 data_len 37 name_len 16 name: security.selinux data unconfined_u:object_r:unlabeled_t:s0 item 12 key (258 EXTENT_DATA 61440) itemoff 15392 itemsize 53 generation 10 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 2 +- fs/btrfs/zstd.c | 75 +++++++++++++----------------------------- 2 files changed, 23 insertions(+), 54 deletions(-) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index afd7e50d073d4..97fe3ebf11a22 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -169,7 +169,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 0d66db8bc1d47..346c46d88d07f 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -20,6 +20,7 @@ #include "misc.h" #include "compression.h" #include "ctree.h" +#include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) @@ -618,80 +619,48 @@ done: } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; - size_t ret2; - unsigned long total_out = 0; - unsigned long pg_offset = 0; + unsigned long to_copy = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { pr_warn("BTRFS: zstd_init_dstream failed\n"); - ret = -EIO; goto finish; } - destlen = min_t(size_t, destlen, PAGE_SIZE); - workspace->in_buf.src = data_in; workspace->in_buf.pos = 0; workspace->in_buf.size = srclen; workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = PAGE_SIZE; - - ret2 = 1; - while (pg_offset < destlen - && workspace->in_buf.pos < workspace->in_buf.size) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - /* Check if the frame is over and we still need more input */ - if (ret2 == 0) { - pr_debug("BTRFS: zstd_decompress_stream ended early\n"); - ret = -EIO; - goto finish; - } - ret2 = zstd_decompress_stream(stream, &workspace->out_buf, - &workspace->in_buf); - if (zstd_is_error(ret2)) { - pr_debug("BTRFS: zstd_decompress_stream returned %d\n", - zstd_get_error_code(ret2)); - ret = -EIO; - goto finish; - } - - buf_start = total_out; - total_out += workspace->out_buf.pos; - workspace->out_buf.pos = 0; - - if (total_out <= start_byte) - continue; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min_t(unsigned long, destlen - pg_offset, - workspace->out_buf.size - buf_offset); - - memcpy_to_page(dest_page, pg_offset, - workspace->out_buf.dst + buf_offset, bytes); - - pg_offset += bytes; + workspace->out_buf.size = sectorsize; + + /* + * Since both input and output buffers should not exceed one sector, + * one call should end the decompression. + */ + ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); + if (zstd_is_error(ret)) { + pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n", + zstd_get_error_code(ret)); + goto finish; } - ret = 0; + to_copy = workspace->out_buf.pos; + memcpy_to_page(dest_page, dest_pgoff + to_copy, workspace->out_buf.dst, to_copy); finish: - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); + /* Error or early end. */ + if (unlikely(to_copy < destlen)) { + ret = -EIO; + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); } return ret; } -- GitLab From f398e70dd69e6ceea71463a5380e6118f219197e Mon Sep 17 00:00:00 2001 From: Chung-Chiang Cheng Date: Fri, 12 Jan 2024 15:41:05 +0800 Subject: [PATCH 1175/1290] btrfs: tree-checker: fix inline ref size in error messages The error message should accurately reflect the size rather than the type. Fixes: f82d1c7ca8ae ("btrfs: tree-checker: Add EXTENT_ITEM and METADATA_ITEM check") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: Chung-Chiang Cheng Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 50fdc69fdddf9..6eccf8496486c 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1436,7 +1436,7 @@ static int check_extent_item(struct extent_buffer *leaf, if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", - ptr, inline_type, end); + ptr, btrfs_extent_inline_ref_size(inline_type), end); return -EUCLEAN; } -- GitLab From a208b3f132b48e1f94f620024e66fea635925877 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 15 Jan 2024 20:30:26 +0100 Subject: [PATCH 1176/1290] btrfs: don't warn if discard range is not aligned to sector There's a warning in btrfs_issue_discard() when the range is not aligned to 512 bytes, originally added in 4d89d377bbb0 ("btrfs: btrfs_issue_discard ensure offset/length are aligned to sector boundaries"). We can't do sub-sector writes anyway so the adjustment is the only thing that we can do and the warning is unnecessary. CC: stable@vger.kernel.org # 4.19+ Reported-by: syzbot+4a4f1eba14eb5c3417d1@syzkaller.appspotmail.com Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6d680031211a1..8e8cc11112772 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1260,7 +1260,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 bytes_left, end; u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); - if (WARN_ON(start != aligned_start)) { + /* Adjust the range to be aligned to 512B sectors if necessary. */ + if (start != aligned_start) { len -= aligned_start - start; len = round_down(len, 1 << SECTOR_SHIFT); start = aligned_start; -- GitLab From 2018ef1d9ac3e95448b9206adc3425b0431c2411 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 10 Jan 2024 12:54:41 -0500 Subject: [PATCH 1177/1290] btrfs: use the original mount's mount options for the legacy reconfigure btrfs/330, which tests our old trick to allow mount -o ro,subvol=/x /dev/sda1 /foo mount -o rw,subvol=/y /dev/sda1 /bar fails on the block group tree. This is because we aren't preserving the mount options for what is essentially a remount, and thus we're ending up without the FREE_SPACE_TREE mount option, which triggers our free space tree delete codepath. This isn't possible with the block group tree and thus it falls over. Fix this by making sure we copy the existing mount options for the existing fs mount over in this case. Fixes: f044b318675f ("btrfs: handle the ro->rw transition for mounting different subvolumes") Reviewed-by: Neal Gompa Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3a677b808f0f7..f192f8fe0ce62 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1451,6 +1451,14 @@ static int btrfs_reconfigure(struct fs_context *fc) btrfs_info_to_ctx(fs_info, &old_ctx); + /* + * This is our "bind mount" trick, we don't want to allow the user to do + * anything other than mount a different ro/rw and a different subvol, + * all of the mount options should be maintained. + */ + if (mount_reconfigure) + ctx->mount_opt = old_ctx.mount_opt; + sync_filesystem(sb); set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); -- GitLab From 1e61b8c672ab2f59b282c8d48a29c14b52c0f5b4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 10 Jan 2024 17:14:21 -0500 Subject: [PATCH 1178/1290] btrfs: don't unconditionally call folio_start_writeback in subpage In the normal case we check if a page is under writeback and skip it before we attempt to begin writeback. The exception is subpage metadata writes, where we know we don't have an eb under writeback and we're doing it one eb at a time. Since b5612c368648 ("mm: return void from folio_start_writeback() and related functions") we now will BUG_ON() if we call folio_start_writeback() on a folio that's already under writeback. Previously folio_start_writeback() would bail if writeback was already started. Fix this in the subpage code by checking if we have writeback set and skipping it if we do. This fixes the panic we were seeing on subpage. Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index d9a30b93d543b..277dd6d312ee3 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -475,7 +475,8 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - folio_start_writeback(folio); + if (!folio_test_writeback(folio)) + folio_start_writeback(folio); spin_unlock_irqrestore(&subpage->lock, flags); } -- GitLab From 4525462dd0db9e86bb67c10dedbbaa4f8d62697d Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Thu, 18 Jan 2024 14:36:45 -0800 Subject: [PATCH 1179/1290] riscv: lib: Check if output in asm goto supported The output field of an asm goto statement is not supported by all compilers. If it is not supported, fallback to the non-optimized code. Signed-off-by: Charlie Jenkins Fixes: a04c192eabfb ("riscv: Add checksum library") Link: https://lore.kernel.org/r/20240118-csum_remove_output_operands_asm_goto-v2-1-5d1b73cf93d4@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/lib/csum.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/lib/csum.c b/arch/riscv/lib/csum.c index 06ce8e7250d98..af3df5274ccba 100644 --- a/arch/riscv/lib/csum.c +++ b/arch/riscv/lib/csum.c @@ -156,6 +156,7 @@ do_csum_with_alignment(const unsigned char *buff, int len) end = (const unsigned long *)(buff + len); csum = do_csum_common(ptr, end, data); +#ifdef CC_HAS_ASM_GOTO_TIED_OUTPUT /* * Zbb support saves 6 instructions, so not worth checking without * alternatives if supported @@ -214,6 +215,7 @@ end: return csum >> 16; } no_zbb: +#endif /* CC_HAS_ASM_GOTO_TIED_OUTPUT */ #ifndef CONFIG_32BIT csum += ror64(csum, 32); csum >>= 32; -- GitLab From f546c4282673497a06ecb6190b50ae7f6c85b02f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 17 Jan 2024 11:02:25 +1030 Subject: [PATCH 1180/1290] btrfs: scrub: avoid use-after-free when chunk length is not 64K aligned [BUG] There is a bug report that, on a ext4-converted btrfs, scrub leads to various problems, including: - "unable to find chunk map" errors BTRFS info (device vdb): scrub: started on devid 1 BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 4096 BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 45056 This would lead to unrepariable errors. - Use-after-free KASAN reports: ================================================================== BUG: KASAN: slab-use-after-free in __blk_rq_map_sg+0x18f/0x7c0 Read of size 8 at addr ffff8881013c9040 by task btrfs/909 CPU: 0 PID: 909 Comm: btrfs Not tainted 6.7.0-x64v3-dbg #11 c50636e9419a8354555555245df535e380563b2b Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 2023.11-2 12/24/2023 Call Trace: dump_stack_lvl+0x43/0x60 print_report+0xcf/0x640 kasan_report+0xa6/0xd0 __blk_rq_map_sg+0x18f/0x7c0 virtblk_prep_rq.isra.0+0x215/0x6a0 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff] virtio_queue_rqs+0xc4/0x310 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff] blk_mq_flush_plug_list.part.0+0x780/0x860 __blk_flush_plug+0x1ba/0x220 blk_finish_plug+0x3b/0x60 submit_initial_group_read+0x10a/0x290 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] flush_scrub_stripes+0x38e/0x430 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] scrub_stripe+0x82a/0xae0 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] scrub_chunk+0x178/0x200 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] scrub_enumerate_chunks+0x4bc/0xa30 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] btrfs_scrub_dev+0x398/0x810 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] btrfs_ioctl+0x4b9/0x3020 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] __x64_sys_ioctl+0xbd/0x100 do_syscall_64+0x5d/0xe0 entry_SYSCALL_64_after_hwframe+0x63/0x6b RIP: 0033:0x7f47e5e0952b - Crash, mostly due to above use-after-free [CAUSE] The converted fs has the following data chunk layout: item 2 key (FIRST_CHUNK_TREE CHUNK_ITEM 2214658048) itemoff 16025 itemsize 80 length 86016 owner 2 stripe_len 65536 type DATA|single For above logical bytenr 2214744064, it's at the chunk end (2214658048 + 86016 = 2214744064). This means btrfs_submit_bio() would split the bio, and trigger endio function for both of the two halves. However scrub_submit_initial_read() would only expect the endio function to be called once, not any more. This means the first endio function would already free the bbio::bio, leaving the bvec freed, thus the 2nd endio call would lead to use-after-free. [FIX] - Make sure scrub_read_endio() only updates bits in its range Since we may read less than 64K at the end of the chunk, we should not touch the bits beyond chunk boundary. - Make sure scrub_submit_initial_read() only to read the chunk range This is done by calculating the real number of sectors we need to read, and add sector-by-sector to the bio. Thankfully the scrub read repair path won't need extra fixes: - scrub_stripe_submit_repair_read() With above fixes, we won't update error bit for range beyond chunk, thus scrub_stripe_submit_repair_read() should never submit any read beyond the chunk. Reported-by: Rongrong Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure") Tested-by: Rongrong Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a01807cbd4d44..2d81b1a18a046 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1098,12 +1098,22 @@ out: static void scrub_read_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + int num_sectors; + u32 bio_size = 0; + int i; + + ASSERT(sector_nr < stripe->nr_sectors); + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; + num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); - bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); + bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); + bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); } else { - bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); + bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1701,6 +1711,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); @@ -1715,14 +1728,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, scrub_read_endio, stripe); - /* Read the whole stripe. */ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; - for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { + /* Read the whole range inside the chunk boundary. */ + for (unsigned int cur = 0; cur < nr_sectors; cur++) { + struct page *page = scrub_stripe_get_page(stripe, cur); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); int ret; - ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); /* We should have allocated enough bio vectors. */ - ASSERT(ret == PAGE_SIZE); + ASSERT(ret == fs_info->sectorsize); } atomic_inc(&stripe->pending_io); -- GitLab From 7f2d219e78e95a137a9c76fddac7ff8228260439 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 17 Jan 2024 11:02:26 +1030 Subject: [PATCH 1181/1290] btrfs: scrub: limit RST scrub to chunk boundary [BUG] If there is an extent beyond chunk boundary, currently RST scrub would error out. [CAUSE] In scrub_submit_extent_sector_read(), we completely rely on extent_sector_bitmap, which is populated using extent tree. The extent tree can be corrupted that there is an extent item beyond a chunk. In that case, RST scrub would fail and error out. [FIX] Despite the extent_sector_bitmap usage, also limit the read to chunk boundary. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2d81b1a18a046..0123d27289237 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1646,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; @@ -1656,6 +1659,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, struct page *page = scrub_stripe_get_page(stripe, i); unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); + /* We're beyond the chunk boundary, no need to read anymore. */ + if (i >= nr_sectors) + break; + /* The current sector cannot be merged, submit the bio. */ if (bbio && ((i > 0 && -- GitLab From 17d49b7e47a1001c8796f05f4a2bbdef0a998213 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 3 Jan 2024 09:57:07 -0700 Subject: [PATCH 1182/1290] power: supply: bq24190_charger: Fix "initializer element is not constant" error When building with a version of GCC prior to 8.x, there is an error around non-constant initializer elements: drivers/power/supply/bq24190_charger.c:1978:16: error: initializer element is not constant .vbus_desc = bq24190_vbus_desc, ^~~~~~~~~~~~~~~~~ drivers/power/supply/bq24190_charger.c:1978:16: note: (near initialization for 'bq24190_chip_info_tbl[0].vbus_desc') drivers/power/supply/bq24190_charger.c:1989:16: error: initializer element is not constant .vbus_desc = bq24190_vbus_desc, ^~~~~~~~~~~~~~~~~ drivers/power/supply/bq24190_charger.c:1989:16: note: (near initialization for 'bq24190_chip_info_tbl[1].vbus_desc') drivers/power/supply/bq24190_charger.c:2000:16: error: initializer element is not constant .vbus_desc = bq24190_vbus_desc, ^~~~~~~~~~~~~~~~~ drivers/power/supply/bq24190_charger.c:2000:16: note: (near initialization for 'bq24190_chip_info_tbl[2].vbus_desc') drivers/power/supply/bq24190_charger.c:2011:16: error: initializer element is not constant .vbus_desc = bq24190_vbus_desc, ^~~~~~~~~~~~~~~~~ drivers/power/supply/bq24190_charger.c:2011:16: note: (near initialization for 'bq24190_chip_info_tbl[3].vbus_desc') drivers/power/supply/bq24190_charger.c:2022:16: error: initializer element is not constant .vbus_desc = bq24296_vbus_desc, ^~~~~~~~~~~~~~~~~ drivers/power/supply/bq24190_charger.c:2022:16: note: (near initialization for 'bq24190_chip_info_tbl[4].vbus_desc') Clang versions prior to 17.x show a similar error: drivers/power/supply/bq24190_charger.c:1978:16: error: initializer element is not a compile-time constant .vbus_desc = bq24190_vbus_desc, ^~~~~~~~~~~~~~~~~ 1 error generated. Newer compilers have decided to accept these structures as compile time constants as an extension. To resolve this issue for all supported compilers, change the vbus_desc member in 'struct bq24190_chip_info' to a pointer, as it is only ever passed by reference anyways, and adjust the assignments accordingly. Closes: https://github.com/ClangBuiltLinux/linux/issues/1973 Fixes: b150a703b56f ("power: supply: bq24190_charger: Add support for BQ24296") Signed-off-by: Nathan Chancellor Reviewed-by: Justin Stitt Link: https://lore.kernel.org/r/20240103-fix-bq24190_charger-vbus_desc-non-const-v1-1-115ddf798c70@kernel.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq24190_charger.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c index a8995a21fadbf..2b393eb5c2820 100644 --- a/drivers/power/supply/bq24190_charger.c +++ b/drivers/power/supply/bq24190_charger.c @@ -246,7 +246,7 @@ struct bq24190_dev_info { struct bq24190_chip_info { int ichg_array_size; #ifdef CONFIG_REGULATOR - const struct regulator_desc vbus_desc; + const struct regulator_desc *vbus_desc; #endif int (*check_chip)(struct bq24190_dev_info *bdi); int (*set_chg_config)(struct bq24190_dev_info *bdi, const u8 chg_config); @@ -728,7 +728,7 @@ static int bq24190_register_vbus_regulator(struct bq24190_dev_info *bdi) else cfg.init_data = &bq24190_vbus_init_data; cfg.driver_data = bdi; - reg = devm_regulator_register(bdi->dev, &bdi->info->vbus_desc, &cfg); + reg = devm_regulator_register(bdi->dev, bdi->info->vbus_desc, &cfg); if (IS_ERR(reg)) { ret = PTR_ERR(reg); dev_err(bdi->dev, "Can't register regulator: %d\n", ret); @@ -1975,7 +1975,7 @@ static const struct bq24190_chip_info bq24190_chip_info_tbl[] = { [BQ24190] = { .ichg_array_size = ARRAY_SIZE(bq24190_ccc_ichg_values), #ifdef CONFIG_REGULATOR - .vbus_desc = bq24190_vbus_desc, + .vbus_desc = &bq24190_vbus_desc, #endif .check_chip = bq24190_check_chip, .set_chg_config = bq24190_battery_set_chg_config, @@ -1986,7 +1986,7 @@ static const struct bq24190_chip_info bq24190_chip_info_tbl[] = { [BQ24192] = { .ichg_array_size = ARRAY_SIZE(bq24190_ccc_ichg_values), #ifdef CONFIG_REGULATOR - .vbus_desc = bq24190_vbus_desc, + .vbus_desc = &bq24190_vbus_desc, #endif .check_chip = bq24190_check_chip, .set_chg_config = bq24190_battery_set_chg_config, @@ -1997,7 +1997,7 @@ static const struct bq24190_chip_info bq24190_chip_info_tbl[] = { [BQ24192i] = { .ichg_array_size = ARRAY_SIZE(bq24190_ccc_ichg_values), #ifdef CONFIG_REGULATOR - .vbus_desc = bq24190_vbus_desc, + .vbus_desc = &bq24190_vbus_desc, #endif .check_chip = bq24190_check_chip, .set_chg_config = bq24190_battery_set_chg_config, @@ -2008,7 +2008,7 @@ static const struct bq24190_chip_info bq24190_chip_info_tbl[] = { [BQ24196] = { .ichg_array_size = ARRAY_SIZE(bq24190_ccc_ichg_values), #ifdef CONFIG_REGULATOR - .vbus_desc = bq24190_vbus_desc, + .vbus_desc = &bq24190_vbus_desc, #endif .check_chip = bq24190_check_chip, .set_chg_config = bq24190_battery_set_chg_config, @@ -2019,7 +2019,7 @@ static const struct bq24190_chip_info bq24190_chip_info_tbl[] = { [BQ24296] = { .ichg_array_size = BQ24296_CCC_ICHG_VALUES_LEN, #ifdef CONFIG_REGULATOR - .vbus_desc = bq24296_vbus_desc, + .vbus_desc = &bq24296_vbus_desc, #endif .check_chip = bq24296_check_chip, .set_chg_config = bq24296_battery_set_chg_config, -- GitLab From d7851dc13d87688e2c532f0e77c2bd29f902d6cf Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 17 Jan 2024 17:59:59 -0600 Subject: [PATCH 1183/1290] smb3: minor documentation updates Update the usage documentation to include some missing configuration options. Update the todo list documentation for cifs.ko Reviewed-by: Bharath SM Signed-off-by: Steve French --- Documentation/admin-guide/cifs/todo.rst | 36 +++++++++++++----------- Documentation/admin-guide/cifs/usage.rst | 8 +++++- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/Documentation/admin-guide/cifs/todo.rst b/Documentation/admin-guide/cifs/todo.rst index 2646ed2e2d3e3..e46c360013947 100644 --- a/Documentation/admin-guide/cifs/todo.rst +++ b/Documentation/admin-guide/cifs/todo.rst @@ -2,7 +2,8 @@ TODO ==== -Version 2.14 December 21, 2018 +As of 6.7 kernel. See https://wiki.samba.org/index.php/LinuxCIFSKernel +for list of features added by release A Partial List of Missing Features ================================== @@ -12,22 +13,22 @@ for visible, important contributions to this module. Here is a partial list of the known problems and missing features: a) SMB3 (and SMB3.1.1) missing optional features: + multichannel performance optimizations, algorithmic channel selection, + directory leases optimizations, + support for faster packet signing (GMAC), + support for compression over the network, + T10 copy offload ie "ODX" (copy chunk, and "Duplicate Extents" ioctl + are currently the only two server side copy mechanisms supported) - - multichannel (partially integrated), integration of multichannel with RDMA - - directory leases (improved metadata caching). Currently only implemented for root dir - - T10 copy offload ie "ODX" (copy chunk, and "Duplicate Extents" ioctl - currently the only two server side copy mechanisms supported) +b) Better optimized compounding and error handling for sparse file support, + perhaps addition of new optional SMB3.1.1 fsctls to make collapse range + and insert range more atomic -b) improved sparse file support (fiemap and SEEK_HOLE are implemented - but additional features would be supportable by the protocol such - as FALLOC_FL_COLLAPSE_RANGE and FALLOC_FL_INSERT_RANGE) - -c) Directory entry caching relies on a 1 second timer, rather than - using Directory Leases, currently only the root file handle is cached longer - by leveraging Directory Leases +c) Support for SMB3.1.1 over QUIC (and perhaps other socket based protocols + like SCTP) d) quota support (needs minor kernel change since quota calls otherwise - won't make it to network filesystems or deviceless filesystems). + won't make it to network filesystems or deviceless filesystems). e) Additional use cases can be optimized to use "compounding" (e.g. open/query/close and open/setinfo/close) to reduce the number of @@ -92,10 +93,13 @@ t) split cifs and smb3 support into separate modules so legacy (and less v) Additional testing of POSIX Extensions for SMB3.1.1 -w) Add support for additional strong encryption types, and additional spnego - authentication mechanisms (see MS-SMB2). GCM-256 is now partially implemented. +w) Support for the Mac SMB3.1.1 extensions to improve interop with Apple servers + +x) Support for additional authentication options (e.g. IAKERB, peer-to-peer + Kerberos, SCRAM and others supported by existing servers) -x) Finish support for SMB3.1.1 compression +y) Improved tracing, more eBPF trace points, better scripts for performance + analysis Known Bugs ========== diff --git a/Documentation/admin-guide/cifs/usage.rst b/Documentation/admin-guide/cifs/usage.rst index 5f936b4b60188..aa8290a29dc88 100644 --- a/Documentation/admin-guide/cifs/usage.rst +++ b/Documentation/admin-guide/cifs/usage.rst @@ -81,7 +81,7 @@ much older and less secure than the default dialect SMB3 which includes many advanced security features such as downgrade attack detection and encrypted shares and stronger signing and authentication algorithms. There are additional mount options that may be helpful for SMB3 to get -improved POSIX behavior (NB: can use vers=3.0 to force only SMB3, never 2.1): +improved POSIX behavior (NB: can use vers=3 to force SMB3 or later, never 2.1): ``mfsymlinks`` and either ``cifsacl`` or ``modefromsid`` (usually with ``idsfromsid``) @@ -715,6 +715,7 @@ DebugData Displays information about active CIFS sessions and Stats Lists summary resource usage information as well as per share statistics. open_files List all the open file handles on all active SMB sessions. +mount_params List of all mount parameters available for the module ======================= ======================================================= Configuration pseudo-files: @@ -864,6 +865,11 @@ i.e.:: echo "value" > /sys/module/cifs/parameters/ +More detailed descriptions of the available module parameters and their values +can be seen by doing: + + modinfo cifs (or modinfo smb3) + ================= ========================================================== 1. enable_oplocks Enable or disable oplocks. Oplocks are enabled by default. [Y/y/1]. To disable use any of [N/n/0]. -- GitLab From 936eba9cfb5cfbf6a2c762cd163605f2b784e03e Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 17 Jan 2024 05:55:39 +0000 Subject: [PATCH 1184/1290] cifs: open_cached_dir should not rely on primary channel open_cached_dir today selects ses->server a.k.a primary channel to send requests. When multichannel is used, the primary channel maybe down. So it does not make sense to rely only on that channel. This fix makes this function pick a channel with the standard helper function cifs_pick_channel. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/cached_dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index d64a306a414be..9718926205047 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -151,7 +151,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, return -EOPNOTSUPP; ses = tcon->ses; - server = ses->server; + server = cifs_pick_channel(ses); cfids = tcon->cfids; if (!server->ops->new_lease_key) -- GitLab From 268b8b5797becb242013fcd63173eb28c007c8ae Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 10 Jan 2024 10:48:36 +0000 Subject: [PATCH 1185/1290] cifs: pick channel for tcon and tdis Today, the tree connect and disconnect requests are sent on the primary channel only. However, the new multichannel logic allows the session to remain active even if one of the channels are alive. So a tree connect can now be triggered during a reconnect on any of its channels. This change changes tcon and tdis calls to pick an active channel instead of the first one. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 50f6bf16b6245..f8d70660ba292 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1958,10 +1958,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, __le16 *unc_path = NULL; int flags = 0; unsigned int total_len; - struct TCP_Server_Info *server; - - /* always use master channel */ - server = ses->server; + struct TCP_Server_Info *server = cifs_pick_channel(ses); cifs_dbg(FYI, "TCON\n"); @@ -2094,6 +2091,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) struct smb2_tree_disconnect_req *req; /* response is trivial */ int rc = 0; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); int flags = 0; unsigned int total_len; struct kvec iov[1]; @@ -2116,7 +2114,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) invalidate_all_cached_dirs(tcon); - rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server, + rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, server, (void **) &req, &total_len); if (rc) @@ -2134,7 +2132,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) rqst.rq_iov = iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, ses->server, + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { -- GitLab From 7f738527a7a03021c7e1b02e188f446845f05eb6 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 17 Jan 2024 06:21:33 +0000 Subject: [PATCH 1186/1290] cifs: new nt status codes from MS-SMB2 MS-SMB2 spec has introduced two new status codes, STATUS_SERVER_UNAVAILABLE and STATUS_FILE_NOT_AVAILABLE which are to be treated as retryable errors. This change adds these to the available mappings and maps them to Linux errno EAGAIN. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/smb2maperror.c | 2 ++ fs/smb/client/smb2status.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c index 1a90dd78b238f..ac1895358908a 100644 --- a/fs/smb/client/smb2maperror.c +++ b/fs/smb/client/smb2maperror.c @@ -1210,6 +1210,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_INVALID_TASK_INDEX, -EIO, "STATUS_INVALID_TASK_INDEX"}, {STATUS_THREAD_ALREADY_IN_TASK, -EIO, "STATUS_THREAD_ALREADY_IN_TASK"}, {STATUS_CALLBACK_BYPASS, -EIO, "STATUS_CALLBACK_BYPASS"}, + {STATUS_SERVER_UNAVAILABLE, -EAGAIN, "STATUS_SERVER_UNAVAILABLE"}, + {STATUS_FILE_NOT_AVAILABLE, -EAGAIN, "STATUS_FILE_NOT_AVAILABLE"}, {STATUS_PORT_CLOSED, -EIO, "STATUS_PORT_CLOSED"}, {STATUS_MESSAGE_LOST, -EIO, "STATUS_MESSAGE_LOST"}, {STATUS_INVALID_MESSAGE, -EIO, "STATUS_INVALID_MESSAGE"}, diff --git a/fs/smb/client/smb2status.h b/fs/smb/client/smb2status.h index a9e958166fc53..9c6d79b0bd497 100644 --- a/fs/smb/client/smb2status.h +++ b/fs/smb/client/smb2status.h @@ -982,6 +982,8 @@ struct ntstatus { #define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501) #define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502) #define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503) +#define STATUS_SERVER_UNAVAILABLE cpu_to_le32(0xC0000466) +#define STATUS_FILE_NOT_AVAILABLE cpu_to_le32(0xC0000467) #define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700) #define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701) #define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702) -- GitLab From cacea81390fd8c8c85404e5eb2adeb83d87a912e Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 18 Jan 2024 06:19:57 +1000 Subject: [PATCH 1187/1290] nouveau/vmm: don't set addr on the fail path to avoid warning nvif_vmm_put gets called if addr is set, but if the allocation fails we don't need to call put, otherwise we get a warning like [523232.435671] ------------[ cut here ]------------ [523232.435674] WARNING: CPU: 8 PID: 1505697 at drivers/gpu/drm/nouveau/nvif/vmm.c:68 nvif_vmm_put+0x72/0x80 [nouveau] [523232.435795] Modules linked in: uinput rfcomm snd_seq_dummy snd_hrtimer nf_conntrack_netbios_ns nf_conntrack_broadcast nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink qrtr bnep sunrpc binfmt_misc intel_rapl_msr intel_rapl_common intel_uncore_frequency intel_uncore_frequency_common isst_if_common iwlmvm nfit libnvdimm vfat fat x86_pkg_temp_thermal intel_powerclamp mac80211 snd_soc_avs snd_soc_hda_codec coretemp snd_hda_ext_core snd_soc_core snd_hda_codec_realtek kvm_intel snd_hda_codec_hdmi snd_compress snd_hda_codec_generic ac97_bus snd_pcm_dmaengine snd_hda_intel libarc4 snd_intel_dspcfg snd_intel_sdw_acpi snd_hda_codec kvm iwlwifi snd_hda_core btusb snd_hwdep btrtl snd_seq btintel irqbypass btbcm rapl snd_seq_device eeepc_wmi btmtk intel_cstate iTCO_wdt cfg80211 snd_pcm asus_wmi bluetooth intel_pmc_bxt iTCO_vendor_support snd_timer ledtrig_audio pktcdvd snd mei_me [523232.435828] sparse_keymap intel_uncore i2c_i801 platform_profile wmi_bmof mei pcspkr ioatdma soundcore i2c_smbus rfkill idma64 dca joydev acpi_tad loop zram nouveau drm_ttm_helper ttm video drm_exec drm_gpuvm gpu_sched crct10dif_pclmul i2c_algo_bit nvme crc32_pclmul crc32c_intel drm_display_helper polyval_clmulni nvme_core polyval_generic e1000e mxm_wmi cec ghash_clmulni_intel r8169 sha512_ssse3 nvme_common wmi pinctrl_sunrisepoint uas usb_storage ip6_tables ip_tables fuse [523232.435849] CPU: 8 PID: 1505697 Comm: gnome-shell Tainted: G W 6.6.0-rc7-nvk-uapi+ #12 [523232.435851] Hardware name: System manufacturer System Product Name/ROG STRIX X299-E GAMING II, BIOS 1301 09/24/2021 [523232.435852] RIP: 0010:nvif_vmm_put+0x72/0x80 [nouveau] [523232.435934] Code: 00 00 48 89 e2 be 02 00 00 00 48 c7 04 24 00 00 00 00 48 89 44 24 08 e8 fc bf ff ff 85 c0 75 0a 48 c7 43 08 00 00 00 00 eb b3 <0f> 0b eb f2 e8 f5 c9 b2 e6 0f 1f 44 00 00 90 90 90 90 90 90 90 90 [523232.435936] RSP: 0018:ffffc900077ffbd8 EFLAGS: 00010282 [523232.435937] RAX: 00000000fffffffe RBX: ffffc900077ffc00 RCX: 0000000000000010 [523232.435938] RDX: 0000000000000010 RSI: ffffc900077ffb38 RDI: ffffc900077ffbd8 [523232.435940] RBP: ffff888e1c4f2140 R08: 0000000000000000 R09: 0000000000000000 [523232.435940] R10: 0000000000000000 R11: 0000000000000000 R12: ffff888503811800 [523232.435941] R13: ffffc900077ffca0 R14: ffff888e1c4f2140 R15: ffff88810317e1e0 [523232.435942] FS: 00007f933a769640(0000) GS:ffff88905fa00000(0000) knlGS:0000000000000000 [523232.435943] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [523232.435944] CR2: 00007f930bef7000 CR3: 00000005d0322001 CR4: 00000000003706e0 [523232.435945] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [523232.435946] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [523232.435964] Call Trace: [523232.435965] [523232.435966] ? nvif_vmm_put+0x72/0x80 [nouveau] [523232.436051] ? __warn+0x81/0x130 [523232.436055] ? nvif_vmm_put+0x72/0x80 [nouveau] [523232.436138] ? report_bug+0x171/0x1a0 [523232.436142] ? handle_bug+0x3c/0x80 [523232.436144] ? exc_invalid_op+0x17/0x70 [523232.436145] ? asm_exc_invalid_op+0x1a/0x20 [523232.436149] ? nvif_vmm_put+0x72/0x80 [nouveau] [523232.436230] ? nvif_vmm_put+0x64/0x80 [nouveau] [523232.436342] nouveau_vma_del+0x80/0xd0 [nouveau] [523232.436506] nouveau_vma_new+0x1a0/0x210 [nouveau] [523232.436671] nouveau_gem_object_open+0x1d0/0x1f0 [nouveau] [523232.436835] drm_gem_handle_create_tail+0xd1/0x180 [523232.436840] drm_prime_fd_to_handle_ioctl+0x12e/0x200 [523232.436844] ? __pfx_drm_prime_fd_to_handle_ioctl+0x10/0x10 [523232.436847] drm_ioctl_kernel+0xd3/0x180 [523232.436849] drm_ioctl+0x26d/0x4b0 [523232.436851] ? __pfx_drm_prime_fd_to_handle_ioctl+0x10/0x10 [523232.436855] nouveau_drm_ioctl+0x5a/0xb0 [nouveau] [523232.437032] __x64_sys_ioctl+0x94/0xd0 [523232.437036] do_syscall_64+0x5d/0x90 [523232.437040] ? syscall_exit_to_user_mode+0x2b/0x40 [523232.437044] ? do_syscall_64+0x6c/0x90 [523232.437046] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Reported-by: Faith Ekstrand Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20240117213852.295565-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nouveau_vmm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_vmm.c b/drivers/gpu/drm/nouveau/nouveau_vmm.c index a6602c0126715..3dda885df5b22 100644 --- a/drivers/gpu/drm/nouveau/nouveau_vmm.c +++ b/drivers/gpu/drm/nouveau/nouveau_vmm.c @@ -108,6 +108,9 @@ nouveau_vma_new(struct nouveau_bo *nvbo, struct nouveau_vmm *vmm, } else { ret = nvif_vmm_get(&vmm->vmm, PTES, false, mem->mem.page, 0, mem->mem.size, &tmp); + if (ret) + goto done; + vma->addr = tmp.addr; } -- GitLab From d87123aa9a7920e88633ffc5c5a0a22ab08bdc06 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 25 Sep 2023 13:10:22 +0200 Subject: [PATCH 1188/1290] sh: ecovec24: Rename missed backlight field from fbdev to dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One instance of gpio_backlight_platform_data.fbdev was renamed, but the second instance was forgotten, causing a build failure: arch/sh/boards/mach-ecovec24/setup.c: In function ‘arch_setup’: arch/sh/boards/mach-ecovec24/setup.c:1223:37: error: ‘struct gpio_backlight_platform_data’ has no member named ‘fbdev’; did you mean ‘dev’? 1223 | gpio_backlight_data.fbdev = NULL; | ^~~~~ | dev Fix this by updating the second instance. Fixes: ed369def91c1579a ("backlight/gpio_backlight: Rename field 'fbdev' to 'dev'") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202309231601.Uu6qcRnU-lkp@intel.com/ Signed-off-by: Geert Uytterhoeven Acked-by: Thomas Zimmermann Reviewed-by: John Paul Adrian Glaubitz Link: https://lore.kernel.org/r/20230925111022.3626362-1-geert+renesas@glider.be Signed-off-by: John Paul Adrian Glaubitz --- arch/sh/boards/mach-ecovec24/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index 0f279360838a4..30d117f9ad7ee 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -1220,7 +1220,7 @@ static int __init arch_setup(void) lcdc_info.ch[0].num_modes = ARRAY_SIZE(ecovec_dvi_modes); /* No backlight */ - gpio_backlight_data.fbdev = NULL; + gpio_backlight_data.dev = NULL; gpio_set_value(GPIO_PTA2, 1); gpio_set_value(GPIO_PTU1, 1); -- GitLab From 99fe83ab3bb0e8aac4d45a9361919794336b2ba8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 21 Nov 2023 08:54:23 +0900 Subject: [PATCH 1189/1290] sh: vsyscall: Remove unnecessary $(foreach ...) There is no need to use $(foreach ...) for iterating over just one parameter. Signed-off-by: Masahiro Yamada Reviewed-by: John Paul Adrian Glaubitz Link: https://lore.kernel.org/r/20231120235423.4103310-1-masahiroy@kernel.org Signed-off-by: John Paul Adrian Glaubitz --- arch/sh/kernel/vsyscall/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/sh/kernel/vsyscall/Makefile b/arch/sh/kernel/vsyscall/Makefile index 6e86644480488..118744d349e21 100644 --- a/arch/sh/kernel/vsyscall/Makefile +++ b/arch/sh/kernel/vsyscall/Makefile @@ -1,11 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 obj-y += vsyscall.o vsyscall-syscall.o vsyscall-syms.o -$(obj)/vsyscall-syscall.o: \ - $(foreach F,trapa,$(obj)/vsyscall-$F.so) +$(obj)/vsyscall-syscall.o: $(obj)/vsyscall-trapa.so # Teach kbuild about targets -targets += $(foreach F,trapa,vsyscall-$F.o vsyscall-$F.so) +targets += vsyscall-trapa.o vsyscall-traps.so targets += vsyscall-note.o vsyscall.lds vsyscall-dummy.o # The DSO images are built using a special linker script -- GitLab From fe0d495e759cee0dbfff4348b5791f21b6f56655 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 22 Dec 2023 11:06:44 -0700 Subject: [PATCH 1190/1290] dmaengine: xilinx: xdma: Fix operator precedence in xdma_prep_interleaved_dma() Clang warns (or errors with CONFIG_WERROR=y): drivers/dma/xilinx/xdma.c:757:68: error: operator '?:' has lower precedence than '+'; '+' will be evaluated first [-Werror,-Wparentheses] 757 | src_addr += dmaengine_get_src_icg(xt, &xt->sgl[i]) + xt->src_inc ? | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^ drivers/dma/xilinx/xdma.c:757:68: note: place parentheses around the '+' expression to silence this warning 757 | src_addr += dmaengine_get_src_icg(xt, &xt->sgl[i]) + xt->src_inc ? | ^ | ( ) drivers/dma/xilinx/xdma.c:757:68: note: place parentheses around the '?:' expression to evaluate it first 757 | src_addr += dmaengine_get_src_icg(xt, &xt->sgl[i]) + xt->src_inc ? | ^ | ( 758 | xt->sgl[i].size : 0; | | ) drivers/dma/xilinx/xdma.c:759:68: error: operator '?:' has lower precedence than '+'; '+' will be evaluated first [-Werror,-Wparentheses] 759 | dst_addr += dmaengine_get_dst_icg(xt, &xt->sgl[i]) + xt->dst_inc ? | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^ drivers/dma/xilinx/xdma.c:759:68: note: place parentheses around the '+' expression to silence this warning 759 | dst_addr += dmaengine_get_dst_icg(xt, &xt->sgl[i]) + xt->dst_inc ? | ^ | ( ) drivers/dma/xilinx/xdma.c:759:68: note: place parentheses around the '?:' expression to evaluate it first 759 | dst_addr += dmaengine_get_dst_icg(xt, &xt->sgl[i]) + xt->dst_inc ? | ^ | ( 760 | xt->sgl[i].size : 0; | | ) The src_inc and dst_inc members of 'struct dma_interleaved_template' are booleans, so it does not make sense for the addition to happen first. Wrap the conditional operator in parantheses so it is evaluated first. Closes: https://github.com/ClangBuiltLinux/linux/issues/1971 Fixes: 2f8f90cd2f8d ("dmaengine: xilinx: xdma: Implement interleaved DMA transfers") Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20231222-dma-xilinx-xdma-clang-fixes-v1-1-84a18ff184d2@kernel.org Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index 4ebc90b41bdb9..d5b9fc3fd955b 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -754,10 +754,10 @@ xdma_prep_interleaved_dma(struct dma_chan *chan, dst_addr = xt->dst_start; for (i = 0; i < xt->frame_size; ++i) { desc_num += xdma_fill_descs(sw_desc, src_addr, dst_addr, xt->sgl[i].size, desc_num); - src_addr += dmaengine_get_src_icg(xt, &xt->sgl[i]) + xt->src_inc ? - xt->sgl[i].size : 0; - dst_addr += dmaengine_get_dst_icg(xt, &xt->sgl[i]) + xt->dst_inc ? - xt->sgl[i].size : 0; + src_addr += dmaengine_get_src_icg(xt, &xt->sgl[i]) + (xt->src_inc ? + xt->sgl[i].size : 0); + dst_addr += dmaengine_get_dst_icg(xt, &xt->sgl[i]) + (xt->dst_inc ? + xt->sgl[i].size : 0); period_size += xt->sgl[i].size; } sw_desc->period_size = period_size; -- GitLab From 620a7e4c1f03a84e10c8c3fa0ae1aab03ef84294 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 22 Dec 2023 11:06:45 -0700 Subject: [PATCH 1191/1290] dmaengine: xilinx: xdma: Fix initialization location of desc in xdma_channel_isr() Clang warns (or errors with CONFIG_WERROR=y): drivers/dma/xilinx/xdma.c:894:3: error: variable 'desc' is uninitialized when used here [-Werror,-Wuninitialized] 894 | desc->error = true; | ^~~~ The initialization of desc was moved too far forward, move it back so that this assignment does not result in a potential crash at runtime while clearing up the warning. Closes: https://github.com/ClangBuiltLinux/linux/issues/1972 Fixes: 2f8f90cd2f8d ("dmaengine: xilinx: xdma: Implement interleaved DMA transfers") Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20231222-dma-xilinx-xdma-clang-fixes-v1-2-84a18ff184d2@kernel.org Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index d5b9fc3fd955b..ee595d1ebc63e 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -888,6 +888,8 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) if (ret) goto out; + desc = to_xdma_desc(vd); + st &= XDMA_CHAN_STATUS_MASK; if ((st & XDMA_CHAN_ERROR_MASK) || !(st & (CHAN_CTRL_IE_DESC_COMPLETED | CHAN_CTRL_IE_DESC_STOPPED))) { @@ -901,7 +903,6 @@ static irqreturn_t xdma_channel_isr(int irq, void *dev_id) if (ret) goto out; - desc = to_xdma_desc(vd); if (desc->interleaved_dma) { xchan->busy = false; desc->completed_desc_num += complete_desc_num; -- GitLab From 98373a21159379341742dadd6c038fe8ff34d9a1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 18 Jan 2024 19:28:32 -0800 Subject: [PATCH 1192/1290] dmaengine: imx-sdma: fix Excess kernel-doc warnings Fix warnings of "Excess struct member" by removing those lines. They are extraneous. imx-sdma.c:467: warning: Excess struct member 'context_loaded' description in 'sdma_channel' imx-sdma.c:467: warning: Excess struct member 'bd_pool' description in 'sdma_channel' imx-sdma.c:500: warning: Excess struct member 'script_addrs' description in 'sdma_firmware_header' Signed-off-by: Randy Dunlap Cc: Sascha Hauer Cc: Shawn Guo Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20240119032832.4051-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- drivers/dma/imx-sdma.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index f81ecf5863e86..9b42f5e96b1e0 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -421,9 +421,7 @@ struct sdma_desc { * @shp_addr: value for gReg[6] * @per_addr: value for gReg[2] * @status: status of dma channel - * @context_loaded: ensure context is only loaded once * @data: specific sdma interface structure - * @bd_pool: dma_pool for bd * @terminate_worker: used to call back into terminate work function * @terminated: terminated list * @is_ram_script: flag for script in ram @@ -486,8 +484,6 @@ struct sdma_channel { * @num_script_addrs: Number of script addresses in this image * @ram_code_start: offset of SDMA ram image in this firmware image * @ram_code_size: size of SDMA ram image - * @script_addrs: Stores the start address of the SDMA scripts - * (in SDMA memory space) */ struct sdma_firmware_header { u32 magic; -- GitLab From c4d6dcb3b6250ea546a952ad33382daf7cd32425 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 10 Jan 2024 22:27:17 +0000 Subject: [PATCH 1193/1290] dmaengine: sh: rz-dmac: Avoid format-overflow warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The max channel count for RZ DMAC is 16, hence use u8 instead of unsigned int and make the pdev_irqname string long enough to avoid the warning. This fixes the below issue: drivers/dma/sh/rz-dmac.c: In function ‘rz_dmac_probe’: drivers/dma/sh/rz-dmac.c:770:34: warning: ‘%u’ directive writing between 1 and 10 bytes into a region of size 3 [-Wformat-overflow=] 770 | sprintf(pdev_irqname, "ch%u", index); | ^~ In function ‘rz_dmac_chan_probe’, inlined from ‘rz_dmac_probe’ at drivers/dma/sh/rz-dmac.c:910:9: drivers/dma/sh/rz-dmac.c:770:31: note: directive argument in the range [0, 4294967294] 770 | sprintf(pdev_irqname, "ch%u", index); | ^~~~~~ drivers/dma/sh/rz-dmac.c:770:9: note: ‘sprintf’ output between 4 and 13 bytes into a destination of size 5 770 | sprintf(pdev_irqname, "ch%u", index); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ While at it use scnprintf() instead of sprintf() to make the code more robust. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240110222717.193719-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Vinod Koul --- drivers/dma/sh/rz-dmac.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/dma/sh/rz-dmac.c b/drivers/dma/sh/rz-dmac.c index fea5bda34bc20..1f1e86ba5c66a 100644 --- a/drivers/dma/sh/rz-dmac.c +++ b/drivers/dma/sh/rz-dmac.c @@ -755,11 +755,11 @@ static struct dma_chan *rz_dmac_of_xlate(struct of_phandle_args *dma_spec, static int rz_dmac_chan_probe(struct rz_dmac *dmac, struct rz_dmac_chan *channel, - unsigned int index) + u8 index) { struct platform_device *pdev = to_platform_device(dmac->dev); struct rz_lmdesc *lmdesc; - char pdev_irqname[5]; + char pdev_irqname[6]; char *irqname; int ret; @@ -767,7 +767,7 @@ static int rz_dmac_chan_probe(struct rz_dmac *dmac, channel->mid_rid = -EINVAL; /* Request the channel interrupt. */ - sprintf(pdev_irqname, "ch%u", index); + scnprintf(pdev_irqname, sizeof(pdev_irqname), "ch%u", index); channel->irq = platform_get_irq_byname(pdev, pdev_irqname); if (channel->irq < 0) return channel->irq; @@ -845,9 +845,9 @@ static int rz_dmac_probe(struct platform_device *pdev) struct dma_device *engine; struct rz_dmac *dmac; int channel_num; - unsigned int i; int ret; int irq; + u8 i; dmac = devm_kzalloc(&pdev->dev, sizeof(*dmac), GFP_KERNEL); if (!dmac) -- GitLab From 62b68a88795942512936896b9fec1ee7d5fa9922 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 10 Jan 2024 22:22:10 +0000 Subject: [PATCH 1194/1290] dmaengine: usb-dmac: Avoid format-overflow warning gcc points out that the fix-byte buffer might be too small: drivers/dma/sh/usb-dmac.c: In function 'usb_dmac_probe': drivers/dma/sh/usb-dmac.c:720:34: warning: '%u' directive writing between 1 and 10 bytes into a region of size 3 [-Wformat-overflow=] 720 | sprintf(pdev_irqname, "ch%u", index); | ^~ In function 'usb_dmac_chan_probe', inlined from 'usb_dmac_probe' at drivers/dma/sh/usb-dmac.c:814:9: drivers/dma/sh/usb-dmac.c:720:31: note: directive argument in the range [0, 4294967294] 720 | sprintf(pdev_irqname, "ch%u", index); | ^~~~~~ drivers/dma/sh/usb-dmac.c:720:9: note: 'sprintf' output between 4 and 13 bytes into a destination of size 5 720 | sprintf(pdev_irqname, "ch%u", index); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Maximum number of channels for USB-DMAC as per the driver is 1-99 so use u8 instead of unsigned int/int for DMAC channel indexing and make the pdev_irqname string long enough to avoid the warning. While at it use scnprintf() instead of sprintf() to make the code more robust. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240110222210.193479-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Vinod Koul --- drivers/dma/sh/usb-dmac.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/dma/sh/usb-dmac.c b/drivers/dma/sh/usb-dmac.c index a9b4302f60501..f7cd0cad056c1 100644 --- a/drivers/dma/sh/usb-dmac.c +++ b/drivers/dma/sh/usb-dmac.c @@ -706,10 +706,10 @@ static const struct dev_pm_ops usb_dmac_pm = { static int usb_dmac_chan_probe(struct usb_dmac *dmac, struct usb_dmac_chan *uchan, - unsigned int index) + u8 index) { struct platform_device *pdev = to_platform_device(dmac->dev); - char pdev_irqname[5]; + char pdev_irqname[6]; char *irqname; int ret; @@ -717,7 +717,7 @@ static int usb_dmac_chan_probe(struct usb_dmac *dmac, uchan->iomem = dmac->iomem + USB_DMAC_CHAN_OFFSET(index); /* Request the channel interrupt. */ - sprintf(pdev_irqname, "ch%u", index); + scnprintf(pdev_irqname, sizeof(pdev_irqname), "ch%u", index); uchan->irq = platform_get_irq_byname(pdev, pdev_irqname); if (uchan->irq < 0) return -ENODEV; @@ -768,8 +768,8 @@ static int usb_dmac_probe(struct platform_device *pdev) const enum dma_slave_buswidth widths = USB_DMAC_SLAVE_BUSWIDTH; struct dma_device *engine; struct usb_dmac *dmac; - unsigned int i; int ret; + u8 i; dmac = devm_kzalloc(&pdev->dev, sizeof(*dmac), GFP_KERNEL); if (!dmac) @@ -869,7 +869,7 @@ static void usb_dmac_chan_remove(struct usb_dmac *dmac, static void usb_dmac_remove(struct platform_device *pdev) { struct usb_dmac *dmac = platform_get_drvdata(pdev); - int i; + u8 i; for (i = 0; i < dmac->n_channels; ++i) usb_dmac_chan_remove(dmac, &dmac->channels[i]); -- GitLab From f829bca2e294bc2953bd2dadb93d72a9987b3110 Mon Sep 17 00:00:00 2001 From: Jan Kuliga Date: Sat, 23 Dec 2023 00:17:28 +0100 Subject: [PATCH 1195/1290] dmaengine: xilinx: xdma: Fix kernel-doc warnings Replace hyphens with colons where necessary. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312230634.3AIMQ3OP-lkp@intel.com/ Signed-off-by: Jan Kuliga Reviewed-by: Randy Dunlap Link: https://lore.kernel.org/r/20231222231728.7156-1-jankul@alatek.krakow.pl Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xdma.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/dma/xilinx/xdma.c b/drivers/dma/xilinx/xdma.c index ee595d1ebc63e..170017ff2aad6 100644 --- a/drivers/dma/xilinx/xdma.c +++ b/drivers/dma/xilinx/xdma.c @@ -548,11 +548,11 @@ static void xdma_synchronize(struct dma_chan *chan) /** * xdma_fill_descs - Fill hardware descriptors with contiguous memory block addresses - * @sw_desc - tx descriptor state container - * @src_addr - Value for a ->src_addr field of a first descriptor - * @dst_addr - Value for a ->dst_addr field of a first descriptor - * @size - Total size of a contiguous memory block - * @filled_descs_num - Number of filled hardware descriptors for corresponding sw_desc + * @sw_desc: tx descriptor state container + * @src_addr: Value for a ->src_addr field of a first descriptor + * @dst_addr: Value for a ->dst_addr field of a first descriptor + * @size: Total size of a contiguous memory block + * @filled_descs_num: Number of filled hardware descriptors for corresponding sw_desc */ static inline u32 xdma_fill_descs(struct xdma_desc *sw_desc, u64 src_addr, u64 dst_addr, u32 size, u32 filled_descs_num) -- GitLab From 404290240827c3bb5c4e195174a8854eef2f89ac Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Fri, 19 Jan 2024 18:10:44 +0530 Subject: [PATCH 1196/1290] dmaengine: shdma: increase size of 'dev_id' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We seem to have hit warnings of 'output may be truncated' which is fixed by increasing the size of 'dev_id' drivers/dma/sh/shdmac.c: In function ‘sh_dmae_probe’: drivers/dma/sh/shdmac.c:541:34: error: ‘%d’ directive output may be truncated writing between 1 and 10 bytes into a region of size 9 [-Werror=format-truncation=] 541 | "sh-dmae%d.%d", pdev->id, id); | ^~ In function ‘sh_dmae_chan_probe’, inlined from ‘sh_dmae_probe’ at drivers/dma/sh/shdmac.c:845:9: drivers/dma/sh/shdmac.c:541:26: note: directive argument in the range [0, 2147483647] 541 | "sh-dmae%d.%d", pdev->id, id); | ^~~~~~~~~~~~~~ drivers/dma/sh/shdmac.c:541:26: note: directive argument in the range [0, 19] drivers/dma/sh/shdmac.c:540:17: note: ‘snprintf’ output between 11 and 21 bytes into a destination of size 16 540 | snprintf(sh_chan->dev_id, sizeof(sh_chan->dev_id), | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 541 | "sh-dmae%d.%d", pdev->id, id); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Vinod Koul --- drivers/dma/sh/shdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/sh/shdma.h b/drivers/dma/sh/shdma.h index 9c121a4b33ad8..f97d80343aea4 100644 --- a/drivers/dma/sh/shdma.h +++ b/drivers/dma/sh/shdma.h @@ -25,7 +25,7 @@ struct sh_dmae_chan { const struct sh_dmae_slave_config *config; /* Slave DMA configuration */ int xmit_shift; /* log_2(bytes_per_xfer) */ void __iomem *base; - char dev_id[16]; /* unique name per DMAC of channel */ + char dev_id[32]; /* unique name per DMAC of channel */ int pm_error; dma_addr_t slave_addr; }; -- GitLab From 6386f6c995b3ab91c72cfb76e4465553c555a8da Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Fri, 19 Jan 2024 18:10:44 +0530 Subject: [PATCH 1197/1290] dmaengine: fsl-qdma: increase size of 'irq_name' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We seem to have hit warnings of 'output may be truncated' which is fixed by increasing the size of 'irq_name' drivers/dma/fsl-qdma.c: In function ‘fsl_qdma_irq_init’: drivers/dma/fsl-qdma.c:824:46: error: ‘%d’ directive writing between 1 and 11 bytes into a region of size 10 [-Werror=format-overflow=] 824 | sprintf(irq_name, "qdma-queue%d", i); | ^~ drivers/dma/fsl-qdma.c:824:35: note: directive argument in the range [-2147483641, 2147483646] 824 | sprintf(irq_name, "qdma-queue%d", i); | ^~~~~~~~~~~~~~ drivers/dma/fsl-qdma.c:824:17: note: ‘sprintf’ output between 12 and 22 bytes into a destination of size 20 824 | sprintf(irq_name, "qdma-queue%d", i); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Vinod Koul --- drivers/dma/fsl-qdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/fsl-qdma.c b/drivers/dma/fsl-qdma.c index 47cb284680494..a1d0aa63142a9 100644 --- a/drivers/dma/fsl-qdma.c +++ b/drivers/dma/fsl-qdma.c @@ -805,7 +805,7 @@ fsl_qdma_irq_init(struct platform_device *pdev, int i; int cpu; int ret; - char irq_name[20]; + char irq_name[32]; fsl_qdma->error_irq = platform_get_irq_byname(pdev, "qdma-error"); -- GitLab From cb95a4fa50bbc1262bfb7fea482388a50b12948f Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Fri, 19 Jan 2024 18:10:44 +0530 Subject: [PATCH 1198/1290] dmaengine: dw-edma: increase size of 'name' in debugfs code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We seem to have hit warnings of 'output may be truncated' which is fixed by increasing the size of 'name' drivers/dma/dw-edma/dw-hdma-v0-debugfs.c: In function ‘dw_hdma_v0_debugfs_on’: drivers/dma/dw-edma/dw-hdma-v0-debugfs.c:125:50: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 8 [-Werror=format-truncation=] 125 | snprintf(name, sizeof(name), "%s:%d", CHANNEL_STR, i); | ^~ drivers/dma/dw-edma/dw-hdma-v0-debugfs.c: In function ‘dw_hdma_v0_debugfs_on’: drivers/dma/dw-edma/dw-hdma-v0-debugfs.c:142:50: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 8 [-Werror=format-truncation=] 142 | snprintf(name, sizeof(name), "%s:%d", CHANNEL_STR, i); | ^~ drivers/dma/dw-edma/dw-edma-v0-debugfs.c: In function ‘dw_edma_debugfs_regs_wr’: drivers/dma/dw-edma/dw-edma-v0-debugfs.c:193:50: error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes into a region of size 8 [-Werror=format-truncation=] 193 | snprintf(name, sizeof(name), "%s:%d", CHANNEL_STR, i); | ^~ Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-v0-debugfs.c | 4 ++-- drivers/dma/dw-edma/dw-hdma-v0-debugfs.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c index 0745d9e7d259b..406f169b09a75 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c +++ b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c @@ -176,7 +176,7 @@ dw_edma_debugfs_regs_wr(struct dw_edma *dw, struct dentry *dent) }; struct dentry *regs_dent, *ch_dent; int nr_entries, i; - char name[16]; + char name[32]; regs_dent = debugfs_create_dir(WRITE_STR, dent); @@ -239,7 +239,7 @@ static noinline_for_stack void dw_edma_debugfs_regs_rd(struct dw_edma *dw, }; struct dentry *regs_dent, *ch_dent; int nr_entries, i; - char name[16]; + char name[32]; regs_dent = debugfs_create_dir(READ_STR, dent); diff --git a/drivers/dma/dw-edma/dw-hdma-v0-debugfs.c b/drivers/dma/dw-edma/dw-hdma-v0-debugfs.c index 520c81978b085..dcdc57fe976c1 100644 --- a/drivers/dma/dw-edma/dw-hdma-v0-debugfs.c +++ b/drivers/dma/dw-edma/dw-hdma-v0-debugfs.c @@ -116,7 +116,7 @@ static void dw_hdma_debugfs_regs_ch(struct dw_edma *dw, enum dw_edma_dir dir, static void dw_hdma_debugfs_regs_wr(struct dw_edma *dw, struct dentry *dent) { struct dentry *regs_dent, *ch_dent; - char name[16]; + char name[32]; int i; regs_dent = debugfs_create_dir(WRITE_STR, dent); @@ -133,7 +133,7 @@ static void dw_hdma_debugfs_regs_wr(struct dw_edma *dw, struct dentry *dent) static void dw_hdma_debugfs_regs_rd(struct dw_edma *dw, struct dentry *dent) { struct dentry *regs_dent, *ch_dent; - char name[16]; + char name[32]; int i; regs_dent = debugfs_create_dir(READ_STR, dent); -- GitLab From 61c2ef4b6cb019946479baf0aeded648081bfb5c Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 31 Aug 2023 16:40:08 -0500 Subject: [PATCH 1199/1290] sparc: Use device_get_match_data() Use preferred device_get_match_data() instead of of_match_device() to get the driver match data. With this, adjust the includes to explicitly include the correct headers. Signed-off-by: Rob Herring --- arch/sparc/kernel/pci_sabre.c | 9 +++++---- arch/sparc/kernel/pci_schizo.c | 13 +++++++------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/sparc/kernel/pci_sabre.c b/arch/sparc/kernel/pci_sabre.c index 3c38ca40a22ba..a84598568300d 100644 --- a/arch/sparc/kernel/pci_sabre.c +++ b/arch/sparc/kernel/pci_sabre.c @@ -13,7 +13,10 @@ #include #include #include -#include +#include +#include +#include +#include #include #include @@ -456,7 +459,6 @@ static void sabre_pbm_init(struct pci_pbm_info *pbm, static const struct of_device_id sabre_match[]; static int sabre_probe(struct platform_device *op) { - const struct of_device_id *match; const struct linux_prom64_registers *pr_regs; struct device_node *dp = op->dev.of_node; struct pci_pbm_info *pbm; @@ -466,8 +468,7 @@ static int sabre_probe(struct platform_device *op) const u32 *vdma; u64 clear_irq; - match = of_match_device(sabre_match, &op->dev); - hummingbird_p = match && (match->data != NULL); + hummingbird_p = (uintptr_t)device_get_match_data(&op->dev); if (!hummingbird_p) { struct device_node *cpu_dp; diff --git a/arch/sparc/kernel/pci_schizo.c b/arch/sparc/kernel/pci_schizo.c index 23b47f7fdb1d5..5d8dd49495863 100644 --- a/arch/sparc/kernel/pci_schizo.c +++ b/arch/sparc/kernel/pci_schizo.c @@ -11,7 +11,10 @@ #include #include #include -#include +#include +#include +#include +#include #include #include @@ -1459,15 +1462,13 @@ out_err: return err; } -static const struct of_device_id schizo_match[]; static int schizo_probe(struct platform_device *op) { - const struct of_device_id *match; + unsigned long chip_type = (unsigned long)device_get_match_data(&op->dev); - match = of_match_device(schizo_match, &op->dev); - if (!match) + if (!chip_type) return -EINVAL; - return __schizo_init(op, (unsigned long)match->data); + return __schizo_init(op, chip_type); } /* The ordering of this table is very important. Some Tomatillo -- GitLab From 5e6c3454b40594c6f1d398254e7b4005494f9638 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 1 Sep 2023 14:36:49 -0500 Subject: [PATCH 1200/1290] net: can: Use device_get_match_data() Use preferred device_get_match_data() instead of of_match_device() to get the driver match data. With this, adjust the includes to explicitly include the correct headers. Error checking for matching and match data was not necessary as matching is always successful if we're already in probe and the match tables always have data pointers. Signed-off-by: Rob Herring --- drivers/net/can/c_can/c_can_platform.c | 13 ++----------- drivers/net/can/flexcan/flexcan-core.c | 12 ++---------- drivers/net/can/mscan/mpc5xxx_can.c | 8 ++++---- drivers/net/can/xilinx_can.c | 9 +++------ 4 files changed, 11 insertions(+), 31 deletions(-) diff --git a/drivers/net/can/c_can/c_can_platform.c b/drivers/net/can/c_can/c_can_platform.c index f44ba2600415f..e2ec69aa46e53 100644 --- a/drivers/net/can/c_can/c_can_platform.c +++ b/drivers/net/can/c_can/c_can_platform.c @@ -30,9 +30,9 @@ #include #include #include +#include #include #include -#include #include #include @@ -259,22 +259,13 @@ static int c_can_plat_probe(struct platform_device *pdev) void __iomem *addr; struct net_device *dev; struct c_can_priv *priv; - const struct of_device_id *match; struct resource *mem; int irq; struct clk *clk; const struct c_can_driver_data *drvdata; struct device_node *np = pdev->dev.of_node; - match = of_match_device(c_can_of_table, &pdev->dev); - if (match) { - drvdata = match->data; - } else if (pdev->id_entry->driver_data) { - drvdata = (struct c_can_driver_data *) - platform_get_device_id(pdev)->driver_data; - } else { - return -ENODEV; - } + drvdata = device_get_match_data(&pdev->dev); /* get the appropriate clk */ clk = devm_clk_get(&pdev->dev, NULL); diff --git a/drivers/net/can/flexcan/flexcan-core.c b/drivers/net/can/flexcan/flexcan-core.c index d15f85a40c1e5..8ea7f2795551b 100644 --- a/drivers/net/can/flexcan/flexcan-core.c +++ b/drivers/net/can/flexcan/flexcan-core.c @@ -23,11 +23,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include @@ -2034,7 +2034,6 @@ MODULE_DEVICE_TABLE(platform, flexcan_id_table); static int flexcan_probe(struct platform_device *pdev) { - const struct of_device_id *of_id; const struct flexcan_devtype_data *devtype_data; struct net_device *dev; struct flexcan_priv *priv; @@ -2090,14 +2089,7 @@ static int flexcan_probe(struct platform_device *pdev) if (IS_ERR(regs)) return PTR_ERR(regs); - of_id = of_match_device(flexcan_of_match, &pdev->dev); - if (of_id) - devtype_data = of_id->data; - else if (platform_get_device_id(pdev)->driver_data) - devtype_data = (struct flexcan_devtype_data *) - platform_get_device_id(pdev)->driver_data; - else - return -ENODEV; + devtype_data = device_get_match_data(&pdev->dev); if ((devtype_data->quirks & FLEXCAN_QUIRK_SUPPORT_FD) && !((devtype_data->quirks & diff --git a/drivers/net/can/mscan/mpc5xxx_can.c b/drivers/net/can/mscan/mpc5xxx_can.c index 4837df6efa926..5b3d69c3b6b66 100644 --- a/drivers/net/can/mscan/mpc5xxx_can.c +++ b/drivers/net/can/mscan/mpc5xxx_can.c @@ -12,8 +12,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -290,7 +292,7 @@ static int mpc5xxx_can_probe(struct platform_device *ofdev) int irq, mscan_clksrc = 0; int err = -ENOMEM; - data = of_device_get_match_data(&ofdev->dev); + data = device_get_match_data(&ofdev->dev); if (!data) return -EINVAL; @@ -351,13 +353,11 @@ exit_unmap_mem: static void mpc5xxx_can_remove(struct platform_device *ofdev) { - const struct of_device_id *match; const struct mpc5xxx_can_data *data; struct net_device *dev = platform_get_drvdata(ofdev); struct mscan_priv *priv = netdev_priv(dev); - match = of_match_device(mpc5xxx_can_table, &ofdev->dev); - data = match ? match->data : NULL; + data = device_get_match_data(&ofdev->dev); unregister_mscandev(dev); if (data && data->put_clock) diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index abe58f1030433..3722eaa84234e 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -1726,8 +1726,7 @@ static int xcan_probe(struct platform_device *pdev) struct net_device *ndev; struct xcan_priv *priv; struct phy *transceiver; - const struct of_device_id *of_id; - const struct xcan_devtype_data *devtype = &xcan_axi_data; + const struct xcan_devtype_data *devtype; void __iomem *addr; int ret; int rx_max, tx_max; @@ -1741,9 +1740,7 @@ static int xcan_probe(struct platform_device *pdev) goto err; } - of_id = of_match_device(xcan_of_match, &pdev->dev); - if (of_id && of_id->data) - devtype = of_id->data; + devtype = device_get_match_data(&pdev->dev); hw_tx_max_property = devtype->flags & XCAN_FLAG_TX_MAILBOXES ? "tx-mailbox-count" : "tx-fifo-depth"; -- GitLab From ed7dafcc5364d5ff1ac85d7b18bf9a00ff28b6f8 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 9 Oct 2023 14:45:08 -0500 Subject: [PATCH 1201/1290] thermal: loongson2: Replace of_device.h with explicit includes The DT of_device.h and of_platform.h date back to the separate of_platform_bus_type before it as merged into the regular platform bus. As part of that merge prepping Arm DT support 13 years ago, they "temporarily" include each other. They also include platform_device.h and of.h. of_device.h isn't needed, but mod_devicetable.h and property.h were implicitly included. Signed-off-by: Rob Herring --- drivers/thermal/loongson2_thermal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/loongson2_thermal.c b/drivers/thermal/loongson2_thermal.c index 99ca0c7bc41c7..0f475fe46bc9d 100644 --- a/drivers/thermal/loongson2_thermal.c +++ b/drivers/thermal/loongson2_thermal.c @@ -8,9 +8,10 @@ #include #include #include +#include #include -#include #include +#include #include #include #include "thermal_hwmon.h" -- GitLab From 527eb67e0cfb3f398d780cf04fde28ee55618a4a Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 11 Dec 2023 16:05:10 +1100 Subject: [PATCH 1202/1290] clk: qcom: gcc-x1e80100: Replace of_device.h with explicit includes The DT of_device.h and of_platform.h date back to the separate of_platform_bus_type before it as merged into the regular platform bus. As part of that merge prepping Arm DT support 13 years ago, they "temporarily" include each other. They also include platform_device.h and of.h. of_device.h isn't needed, but mod_devicetable.h and platform_device.h were implicitly included. Signed-off-by: Stephen Rothwell Reviewed-by: Sibi Sankar Link: https://lore.kernel.org/r/20231211160510.0aef871b@canb.auug.org.au [robh: Redo commit msg] Signed-off-by: Rob Herring --- drivers/clk/qcom/gcc-x1e80100.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clk/qcom/gcc-x1e80100.c b/drivers/clk/qcom/gcc-x1e80100.c index 74db7fef237b4..d7182d6e97837 100644 --- a/drivers/clk/qcom/gcc-x1e80100.c +++ b/drivers/clk/qcom/gcc-x1e80100.c @@ -4,8 +4,9 @@ */ #include +#include #include -#include +#include #include #include -- GitLab From ef175b29a242fea98f467f008237484b03c94834 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 15 Jan 2021 15:24:59 -0600 Subject: [PATCH 1203/1290] of: Stop circularly including of_device.h and of_platform.h The DT of_device.h and of_platform.h headers date back to the separate of_platform_bus_type before it was merged into the regular platform bus. As part of that merge prepping Arm DT support 13 years ago, they "temporarily" include each other. The headers also include platform_device.h and of.h. The result was lots of drivers relied on these implicit includes. Now the entire tree has been fixed over the last couple of cycles to explicitly include the necessary headers instead of relying on of_device.h and/or of_platform.h implicit includes, so the implicit and circular includes can finally be removed. Signed-off-by: Rob Herring --- include/linux/of_device.h | 5 +---- include/linux/of_platform.h | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/include/linux/of_device.h b/include/linux/of_device.h index a72661e47faa5..9042bca5bb848 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -2,10 +2,7 @@ #ifndef _LINUX_OF_DEVICE_H #define _LINUX_OF_DEVICE_H -#include -#include /* temporary until merge */ - -#include +#include struct device; struct of_device_id; diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index fadfea5754852..a2ff1ad48f7f0 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -7,11 +7,11 @@ */ #include -#include -#include struct device; +struct device_node; struct of_device_id; +struct platform_device; /** * struct of_dev_auxdata - lookup table entry for device names & platform_data -- GitLab From 71fee48fb772ac4f6cfa63dbebc5629de8b4cc09 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 15 Jan 2024 17:35:55 +0100 Subject: [PATCH 1204/1290] tick-sched: Fix idle and iowait sleeptime accounting vs CPU hotplug When offlining and onlining CPUs the overall reported idle and iowait times as reported by /proc/stat jump backward and forward: cpu 132 0 176 225249 47 6 6 21 0 0 cpu0 80 0 115 112575 33 3 4 18 0 0 cpu1 52 0 60 112673 13 3 1 2 0 0 cpu 133 0 177 226681 47 6 6 21 0 0 cpu0 80 0 116 113387 33 3 4 18 0 0 cpu 133 0 178 114431 33 6 6 21 0 0 <---- jump backward cpu0 80 0 116 114247 33 3 4 18 0 0 cpu1 52 0 61 183 0 3 1 2 0 0 <---- idle + iowait start with 0 cpu 133 0 178 228956 47 6 6 21 0 0 <---- jump forward cpu0 81 0 117 114929 33 3 4 18 0 0 Reason for this is that get_idle_time() in fs/proc/stat.c has different sources for both values depending on if a CPU is online or offline: - if a CPU is online the values may be taken from its per cpu tick_cpu_sched structure - if a CPU is offline the values are taken from its per cpu cpustat structure The problem is that the per cpu tick_cpu_sched structure is set to zero on CPU offline. See tick_cancel_sched_timer() in kernel/time/tick-sched.c. Therefore when a CPU is brought offline and online afterwards both its idle and iowait sleeptime will be zero, causing a jump backward in total system idle and iowait sleeptime. In a similar way if a CPU is then brought offline again the total idle and iowait sleeptimes will jump forward. It looks like this behavior was introduced with commit 4b0c0f294f60 ("tick: Cleanup NOHZ per cpu data on cpu down"). This was only noticed now on s390, since we switched to generic idle time reporting with commit be76ea614460 ("s390/idle: remove arch_cpu_idle_time() and corresponding code"). Fix this by preserving the values of idle_sleeptime and iowait_sleeptime members of the per-cpu tick_sched structure on CPU hotplug. Fixes: 4b0c0f294f60 ("tick: Cleanup NOHZ per cpu data on cpu down") Reported-by: Gerald Schaefer Signed-off-by: Heiko Carstens Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240115163555.1004144-1-hca@linux.ibm.com --- kernel/time/tick-sched.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a17d260028310..d2501673028da 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1576,13 +1576,18 @@ void tick_setup_sched_timer(void) void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t idle_sleeptime, iowait_sleeptime; # ifdef CONFIG_HIGH_RES_TIMERS if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); # endif + idle_sleeptime = ts->idle_sleeptime; + iowait_sleeptime = ts->iowait_sleeptime; memset(ts, 0, sizeof(*ts)); + ts->idle_sleeptime = idle_sleeptime; + ts->iowait_sleeptime = iowait_sleeptime; } #endif -- GitLab From f24a70106dc1ad2a755b2d42f47cf1dcf24f0b27 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 19 Jan 2024 06:56:01 -0800 Subject: [PATCH 1205/1290] lib: checksum: Fix build with CONFIG_NET=n The generic ipv6 checksums are only defined with CONFIG_NET=y, so gate the test as well. Fixes: 6f4c45cbcb00 ("kunit: Add tests for csum_ipv6_magic and ip_fast_csum") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401192143.jLdjbIy3-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202401192357.WU4nPRdN-lkp@intel.com/ Reviewed-By: Charlie Jenkins Link: https://lore.kernel.org/r/20240119145600.3093-2-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt --- lib/checksum_kunit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/checksum_kunit.c b/lib/checksum_kunit.c index af3e5ca4e1702..225bb77014600 100644 --- a/lib/checksum_kunit.c +++ b/lib/checksum_kunit.c @@ -593,6 +593,7 @@ static void test_ip_fast_csum(struct kunit *test) static void test_csum_ipv6_magic(struct kunit *test) { +#if defined(CONFIG_NET) const struct in6_addr *saddr; const struct in6_addr *daddr; unsigned int len; @@ -616,6 +617,7 @@ static void test_csum_ipv6_magic(struct kunit *test) CHECK_EQ(expected_csum_ipv6_magic[i], csum_ipv6_magic(saddr, daddr, len, proto, csum)); } +#endif /* !CONFIG_NET */ } static struct kunit_case __refdata checksum_test_cases[] = { -- GitLab From cfb7a13399be2234052a5bc480d166cd33047b0c Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 18 Jan 2024 22:36:13 -0600 Subject: [PATCH 1206/1290] cifs: update known bugs mentioned in kernel docs for cifs Remove bugs that have been addressed and add link to xfstest results wiki. Signed-off-by: Steve French --- Documentation/admin-guide/cifs/todo.rst | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/Documentation/admin-guide/cifs/todo.rst b/Documentation/admin-guide/cifs/todo.rst index e46c360013947..9a65c670774ee 100644 --- a/Documentation/admin-guide/cifs/todo.rst +++ b/Documentation/admin-guide/cifs/todo.rst @@ -106,13 +106,7 @@ Known Bugs See https://bugzilla.samba.org - search on product "CifsVFS" for current bug list. Also check http://bugzilla.kernel.org (Product = File System, Component = CIFS) - -1) existing symbolic links (Windows reparse points) are recognized but - can not be created remotely. They are implemented for Samba and those that - support the CIFS Unix extensions, although earlier versions of Samba - overly restrict the pathnames. -2) follow_link and readdir code does not follow dfs junctions - but recognizes them +and xfstest results e.g. https://wiki.samba.org/index.php/Xfstest-results-smb3 Misc testing to do ================== -- GitLab From 76025cc2285d9ede3d717fe4305d66f8be2d9346 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 19 Jan 2024 01:08:26 -0300 Subject: [PATCH 1207/1290] smb: client: fix parsing of SMB3.1.1 POSIX create context The data offset for the SMB3.1.1 POSIX create context will always be 8-byte aligned so having the check 'noff + nlen >= doff' in smb2_parse_contexts() is wrong as it will lead to -EINVAL because noff + nlen == doff. Fix the sanity check to correctly handle aligned create context data. Fixes: af1689a9b770 ("smb: client: fix potential OOBs in smb2_parse_contexts()") Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index f8d70660ba292..ec39dfbc31541 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -2277,7 +2277,7 @@ int smb2_parse_contexts(struct TCP_Server_Info *server, noff = le16_to_cpu(cc->NameOffset); nlen = le16_to_cpu(cc->NameLength); - if (noff + nlen >= doff) + if (noff + nlen > doff) return -EINVAL; name = (char *)cc + noff; -- GitLab From 858e74876c5cbff1dfd5bace99e32fbce2abd4b5 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 19 Jan 2024 01:08:27 -0300 Subject: [PATCH 1208/1290] smb: client: parse owner/group when creating reparse points Parse owner/group when creating special files and symlinks under SMB3.1.1 POSIX mounts. Move the parsing of owner/group to smb2_compound_op() so we don't have to duplicate it in both smb2_get_reparse_inode() and smb311_posix_query_path_info(). Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 2 + fs/smb/client/inode.c | 25 +++----- fs/smb/client/smb2inode.c | 129 ++++++++++++++++++-------------------- fs/smb/client/smb2proto.h | 4 +- 4 files changed, 71 insertions(+), 89 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index f576ceee6157a..49ec4d3713fe9 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -204,6 +204,8 @@ struct cifs_open_info_data { }; } reparse; char *symlink_target; + struct cifs_sid posix_owner; + struct cifs_sid posix_group; union { struct smb2_file_all_info fi; struct smb311_posix_qinfo posix_fi; diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 9f37c1758f732..cedffaad86aea 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -665,8 +665,6 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, /* Fill a cifs_fattr struct with info from POSIX info struct */ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, - struct cifs_sid *owner, - struct cifs_sid *group, struct super_block *sb) { struct smb311_posix_qinfo *info = &data->posix_fi; @@ -722,8 +720,8 @@ out_reparse: fattr->cf_symlink_target = data->symlink_target; data->symlink_target = NULL; } - sid_to_id(cifs_sb, owner, fattr, SIDOWNER); - sid_to_id(cifs_sb, group, fattr, SIDGROUP); + sid_to_id(cifs_sb, &data->posix_owner, fattr, SIDOWNER); + sid_to_id(cifs_sb, &data->posix_group, fattr, SIDGROUP); cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n", fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); @@ -1070,9 +1068,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, - struct cifs_fattr *fattr, - struct cifs_sid *owner, - struct cifs_sid *group) + struct cifs_fattr *fattr) { struct TCP_Server_Info *server = tcon->ses->server; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -1117,7 +1113,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, } if (tcon->posix_extensions) - smb311_posix_info_to_fattr(fattr, data, owner, group, sb); + smb311_posix_info_to_fattr(fattr, data, sb); else cifs_open_info_to_fattr(fattr, data, sb); out: @@ -1171,8 +1167,7 @@ static int cifs_get_fattr(struct cifs_open_info_data *data, */ if (cifs_open_data_reparse(data)) { rc = reparse_info_to_fattr(data, sb, xid, tcon, - full_path, fattr, - NULL, NULL); + full_path, fattr); } else { cifs_open_info_to_fattr(fattr, data, sb); } @@ -1320,7 +1315,6 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon; struct tcon_link *tlink; - struct cifs_sid owner, group; int tmprc; int rc = 0; @@ -1334,8 +1328,7 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data, */ if (!data) { rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, - full_path, &tmp_data, - &owner, &group); + full_path, &tmp_data); data = &tmp_data; } @@ -1347,11 +1340,9 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data, case 0: if (cifs_open_data_reparse(data)) { rc = reparse_info_to_fattr(data, sb, xid, tcon, - full_path, fattr, - &owner, &group); + full_path, fattr); } else { - smb311_posix_info_to_fattr(fattr, data, - &owner, &group, sb); + smb311_posix_info_to_fattr(fattr, data, sb); } break; case -EREMOTE: diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 5053a5550abed..f38cdc38f10cb 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -56,6 +56,35 @@ static inline __u32 file_create_options(struct dentry *dentry) return 0; } +/* Parse owner and group from SMB3.1.1 POSIX query info */ +static int parse_posix_sids(struct cifs_open_info_data *data, + struct kvec *rsp_iov) +{ + struct smb2_query_info_rsp *qi = rsp_iov->iov_base; + unsigned int out_len = le32_to_cpu(qi->OutputBufferLength); + unsigned int qi_len = sizeof(data->posix_fi); + int owner_len, group_len; + u8 *sidsbuf, *sidsbuf_end; + + if (out_len <= qi_len) + return -EINVAL; + + sidsbuf = (u8 *)qi + le16_to_cpu(qi->OutputBufferOffset) + qi_len; + sidsbuf_end = sidsbuf + out_len - qi_len; + + owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end); + if (owner_len == -1) + return -EINVAL; + + memcpy(&data->posix_owner, sidsbuf, owner_len); + group_len = posix_info_sid_size(sidsbuf + owner_len, sidsbuf_end); + if (group_len == -1) + return -EINVAL; + + memcpy(&data->posix_group, sidsbuf + owner_len, group_len); + return 0; +} + /* * note: If cfile is passed, the reference to it is dropped here. * So make sure that you do not reuse cfile after return from this func. @@ -69,7 +98,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, __u32 desired_access, __u32 create_disposition, __u32 create_options, umode_t mode, struct kvec *in_iov, int *cmds, int num_cmds, struct cifsFileInfo *cfile, - __u8 **extbuf, size_t *extbuflen, struct kvec *out_iov, int *out_buftype) { @@ -494,21 +522,9 @@ finished: &rsp_iov[i + 1], sizeof(idata->posix_fi) /* add SIDs */, (char *)&idata->posix_fi); } - if (rc == 0) { - unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength); - - if (length > sizeof(idata->posix_fi)) { - char *base = (char *)rsp_iov[i + 1].iov_base + - le16_to_cpu(qi_rsp->OutputBufferOffset) + - sizeof(idata->posix_fi); - *extbuflen = length - sizeof(idata->posix_fi); - *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL); - if (!*extbuf) - rc = -ENOMEM; - } else { - rc = -EINVAL; - } - } + if (rc == 0) + rc = parse_posix_sids(idata, &rsp_iov[i + 1]); + SMB2_query_info_free(&rqst[num_rqst++]); if (rc) trace_smb3_posix_query_info_compound_err(xid, ses->Suid, @@ -693,9 +709,8 @@ int smb2_query_path_info(const unsigned int xid, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, - in_iov, cmds, 1, cfile, - NULL, NULL, out_iov, out_buftype); + create_options, ACL_NO_MODE, in_iov, + cmds, 1, cfile, out_iov, out_buftype); hdr = out_iov[0].iov_base; /* * If first iov is unset, then SMB session was dropped or we've got a @@ -722,8 +737,8 @@ int smb2_query_path_info(const unsigned int xid, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, cmds, - num_cmds, cfile, NULL, NULL, NULL, NULL); + create_options, ACL_NO_MODE, in_iov, + cmds, num_cmds, cfile, NULL, NULL); break; case -EREMOTE: break; @@ -750,19 +765,13 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, - struct cifs_sid *owner, - struct cifs_sid *group) + struct cifs_open_info_data *data) { int rc; __u32 create_options = 0; struct cifsFileInfo *cfile; struct kvec in_iov[2], out_iov[3] = {}; int out_buftype[3] = {}; - __u8 *sidsbuf = NULL; - __u8 *sidsbuf_end = NULL; - size_t sidsbuflen = 0; - size_t owner_len, group_len; int cmds[2] = { SMB2_OP_POSIX_QUERY_INFO, }; int i, num_cmds; @@ -782,8 +791,8 @@ int smb311_posix_query_path_info(const unsigned int xid, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, cmds, 1, - cfile, &sidsbuf, &sidsbuflen, out_iov, out_buftype); + create_options, ACL_NO_MODE, in_iov, + cmds, 1, cfile, out_iov, out_buftype); /* * If first iov is unset, then SMB session was dropped or we've got a * cached open file (@cfile). @@ -810,32 +819,12 @@ int smb311_posix_query_path_info(const unsigned int xid, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, cmds, - num_cmds, cfile, &sidsbuf, &sidsbuflen, NULL, NULL); + create_options, ACL_NO_MODE, in_iov, + cmds, num_cmds, cfile, NULL, NULL); break; } out: - if (rc == 0) { - sidsbuf_end = sidsbuf + sidsbuflen; - - owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end); - if (owner_len == -1) { - rc = -EINVAL; - goto out; - } - memcpy(owner, sidsbuf, owner_len); - - group_len = posix_info_sid_size( - sidsbuf + owner_len, sidsbuf_end); - if (group_len == -1) { - rc = -EINVAL; - goto out; - } - memcpy(group, sidsbuf + owner_len, group_len); - } - - kfree(sidsbuf); for (i = 0; i < ARRAY_SIZE(out_buftype); i++) free_rsp_buf(out_buftype[i], out_iov[i].iov_base); return rc; @@ -848,9 +837,9 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, { return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, mode, NULL, - &(int){SMB2_OP_MKDIR}, 1, - NULL, NULL, NULL, NULL, NULL); + CREATE_NOT_FILE, mode, + NULL, &(int){SMB2_OP_MKDIR}, 1, + NULL, NULL, NULL); } void @@ -875,7 +864,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE, &in_iov, &(int){SMB2_OP_SET_INFO}, 1, - cfile, NULL, NULL, NULL, NULL); + cfile, NULL, NULL); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -887,8 +876,9 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, drop_cached_dir_by_name(xid, tcon, name, cifs_sb); return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_NOT_FILE, - ACL_NO_MODE, NULL, &(int){SMB2_OP_RMDIR}, 1, - NULL, NULL, NULL, NULL, NULL); + ACL_NO_MODE, NULL, + &(int){SMB2_OP_RMDIR}, 1, + NULL, NULL, NULL); } int @@ -897,8 +887,9 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, NULL, NULL); + ACL_NO_MODE, NULL, + &(int){SMB2_OP_DELETE}, 1, + NULL, NULL, NULL); } static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, @@ -919,8 +910,8 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, in_iov.iov_base = smb2_to_name; in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX); rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, create_options, ACL_NO_MODE, &in_iov, - &command, 1, cfile, NULL, NULL, NULL, NULL); + FILE_OPEN, create_options, ACL_NO_MODE, + &in_iov, &command, 1, cfile, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -971,7 +962,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, &in_iov, &(int){SMB2_OP_SET_EOF}, 1, - cfile, NULL, NULL, NULL, NULL); + cfile, NULL, NULL); } int @@ -999,8 +990,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path, rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, ACL_NO_MODE, &in_iov, - &(int){SMB2_OP_SET_INFO}, 1, cfile, - NULL, NULL, NULL, NULL); + &(int){SMB2_OP_SET_INFO}, 1, + cfile, NULL, NULL); cifs_put_tlink(tlink); return rc; } @@ -1035,7 +1026,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, da, cd, co, ACL_NO_MODE, in_iov, - cmds, 2, cfile, NULL, NULL, NULL, NULL); + cmds, 2, cfile, NULL, NULL); if (!rc) { rc = smb311_posix_get_inode_info(&new, full_path, data, sb, xid); @@ -1045,7 +1036,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, da, cd, co, ACL_NO_MODE, in_iov, - cmds, 2, cfile, NULL, NULL, NULL, NULL); + cmds, 2, cfile, NULL, NULL); if (!rc) { rc = cifs_get_inode_info(&new, full_path, data, sb, xid, NULL); @@ -1072,8 +1063,8 @@ int smb2_query_reparse_point(const unsigned int xid, rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov, - &(int){SMB2_OP_GET_REPARSE}, 1, cfile, - NULL, NULL, NULL, NULL); + &(int){SMB2_OP_GET_REPARSE}, 1, + cfile, NULL, NULL); if (rc) goto out; diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 343ada691e763..0034b537b0b3f 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -299,9 +299,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, - struct cifs_sid *owner, - struct cifs_sid *group); + struct cifs_open_info_data *data); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); -- GitLab From f83709b9e0eb7048d74ba4515f268c6eacbce9c9 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 19 Jan 2024 01:08:28 -0300 Subject: [PATCH 1209/1290] smb: client: get rid of smb311_posix_query_path_info() Merge smb311_posix_query_path_info into ->query_path_info() to get rid of duplicate code. Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/inode.c | 4 +- fs/smb/client/smb2inode.c | 115 +++++++++++--------------------------- 2 files changed, 36 insertions(+), 83 deletions(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index cedffaad86aea..f0989484f2c64 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1312,6 +1312,7 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data, const unsigned int xid) { struct cifs_open_info_data tmp_data = {}; + struct TCP_Server_Info *server; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon; struct tcon_link *tlink; @@ -1322,12 +1323,13 @@ static int smb311_posix_get_fattr(struct cifs_open_info_data *data, if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; /* * 1. Fetch file metadata if not provided (data) */ if (!data) { - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, + rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, &tmp_data); data = &tmp_data; } diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index f38cdc38f10cb..a652200540c8a 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -678,7 +678,7 @@ int smb2_query_path_info(const unsigned int xid, struct smb2_hdr *hdr; struct kvec in_iov[2], out_iov[3] = {}; int out_buftype[3] = {}; - int cmds[2] = { SMB2_OP_QUERY_INFO, }; + int cmds[2]; bool islink; int i, num_cmds; int rc, rc2; @@ -686,20 +686,36 @@ int smb2_query_path_info(const unsigned int xid, data->adjust_tz = false; data->reparse_point = false; - if (strcmp(full_path, "")) - rc = -ENOENT; - else - rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid); - /* If it is a root and its handle is cached then use it */ - if (!rc) { - if (cfid->file_all_info_is_valid) { - memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi)); + /* + * BB TODO: Add support for using cached root handle in SMB3.1.1 POSIX. + * Create SMB2_query_posix_info worker function to do non-compounded + * query when we already have an open file handle for this. For now this + * is fast enough (always using the compounded version). + */ + if (!tcon->posix_extensions) { + if (*full_path) { + rc = -ENOENT; } else { - rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid, - cfid->fid.volatile_fid, &data->fi); + rc = open_cached_dir(xid, tcon, full_path, + cifs_sb, false, &cfid); } - close_cached_dir(cfid); - return rc; + /* If it is a root and its handle is cached then use it */ + if (!rc) { + if (cfid->file_all_info_is_valid) { + memcpy(&data->fi, &cfid->file_all_info, + sizeof(data->fi)); + } else { + rc = SMB2_query_info(xid, tcon, + cfid->fid.persistent_fid, + cfid->fid.volatile_fid, + &data->fi); + } + close_cached_dir(cfid); + return rc; + } + cmds[0] = SMB2_OP_QUERY_INFO; + } else { + cmds[0] = SMB2_OP_POSIX_QUERY_INFO; } in_iov[0].iov_base = data; @@ -722,6 +738,10 @@ int smb2_query_path_info(const unsigned int xid, switch (rc) { case 0: case -EOPNOTSUPP: + /* + * BB TODO: When support for special files added to Samba + * re-verify this path. + */ rc = parse_create_response(data, cifs_sb, &out_iov[0]); if (rc || !data->reparse_point) goto out; @@ -761,75 +781,6 @@ out: return rc; } -int smb311_posix_query_path_info(const unsigned int xid, - struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - const char *full_path, - struct cifs_open_info_data *data) -{ - int rc; - __u32 create_options = 0; - struct cifsFileInfo *cfile; - struct kvec in_iov[2], out_iov[3] = {}; - int out_buftype[3] = {}; - int cmds[2] = { SMB2_OP_POSIX_QUERY_INFO, }; - int i, num_cmds; - - data->adjust_tz = false; - data->reparse_point = false; - - /* - * BB TODO: Add support for using the cached root handle. - * Create SMB2_query_posix_info worker function to do non-compounded query - * when we already have an open file handle for this. For now this is fast enough - * (always using the compounded version). - */ - in_iov[0].iov_base = data; - in_iov[0].iov_len = sizeof(*data); - in_iov[1] = in_iov[0]; - - cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, - cmds, 1, cfile, out_iov, out_buftype); - /* - * If first iov is unset, then SMB session was dropped or we've got a - * cached open file (@cfile). - */ - if (!out_iov[0].iov_base || out_buftype[0] == CIFS_NO_BUFFER) - goto out; - - switch (rc) { - case 0: - case -EOPNOTSUPP: - /* BB TODO: When support for special files added to Samba re-verify this path */ - rc = parse_create_response(data, cifs_sb, &out_iov[0]); - if (rc || !data->reparse_point) - goto out; - - if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) { - /* symlink already parsed in create response */ - num_cmds = 1; - } else { - cmds[1] = SMB2_OP_GET_REPARSE; - num_cmds = 2; - } - create_options |= OPEN_REPARSE_POINT; - cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, in_iov, - cmds, num_cmds, cfile, NULL, NULL); - break; - } - -out: - for (i = 0; i < ARRAY_SIZE(out_buftype); i++) - free_rsp_buf(out_buftype[i], out_iov[i].iov_base); - return rc; -} - int smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, struct cifs_tcon *tcon, const char *name, -- GitLab From 66c9314b61ed5b7bfcff0d89359aa0f975c0ab53 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 19 Jan 2024 01:08:29 -0300 Subject: [PATCH 1210/1290] smb: client: don't clobber ->i_rdev from cached reparse points Don't clobber ->i_rdev from valid reparse inodes over readdir(2) as it can't be provided by query dir responses. Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/smb/client/readdir.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index e24684112ab0a..94255401b38dc 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -133,14 +133,14 @@ retry: * Query dir responses don't provide enough * information about reparse points other than * their reparse tags. Save an invalidation by - * not clobbering the existing mode, size and - * symlink target (if any) when reparse tag and - * ctime haven't changed. + * not clobbering some existing attributes when + * reparse tag and ctime haven't changed. */ rc = 0; if (fattr->cf_cifsattrs & ATTR_REPARSE) { if (likely(reparse_inode_match(inode, fattr))) { fattr->cf_mode = inode->i_mode; + fattr->cf_rdev = inode->i_rdev; fattr->cf_eof = CIFS_I(inode)->server_eof; fattr->cf_symlink_target = NULL; } else { -- GitLab From 49fe25ce838183afac20f40457157ec009a86930 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 3 Jan 2024 08:36:22 +0000 Subject: [PATCH 1211/1290] cifs: reschedule periodic query for server interfaces Today, we schedule periodic query for server interfaces once every 10 minutes once a tree connection has been established. Recent change to handle disabling of multichannel disabled this delayed work. This change reenables it following a reconnect, and the server advertises multichannel. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index ec39dfbc31541..88c60187593f9 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -405,6 +405,8 @@ skip_sess_setup: cifs_server_dbg(VFS, "supports multichannel now\n"); cifs_try_adding_channels(ses); + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); } } else { mutex_unlock(&ses->session_mutex); -- GitLab From ce09f8d8a7130e6edfdd6fcad8eb277824d5de95 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 17 Jan 2024 06:09:16 +0000 Subject: [PATCH 1212/1290] cifs: new mount option called retrans We have several places in the code where we treat the error -EAGAIN very differently. Some code retry for arbitrary number of times. Introducing this new mount option named "retrans", so that all these handlers of -EAGAIN can retry a fixed number of times. This applies only to soft mounts. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 2 ++ fs/smb/client/cifsglob.h | 1 + fs/smb/client/connect.c | 4 ++++ fs/smb/client/fs_context.c | 6 ++++++ fs/smb/client/fs_context.h | 2 ++ 5 files changed, 15 insertions(+) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 99b0ade833aa3..de46beb7fa4a9 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -681,6 +681,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize); if (tcon->ses->server->min_offload) seq_printf(s, ",esize=%u", tcon->ses->server->min_offload); + if (tcon->ses->server->retrans) + seq_printf(s, ",retrans=%u", tcon->ses->server->retrans); seq_printf(s, ",echo_interval=%lu", tcon->ses->server->echo_interval / HZ); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 49ec4d3713fe9..20036fb16cece 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -753,6 +753,7 @@ struct TCP_Server_Info { unsigned int max_read; unsigned int max_write; unsigned int min_offload; + unsigned int retrans; __le16 compress_algorithm; __u16 signing_algorithm; __le16 cipher_type; diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 3052a208c6ca0..bfd568f897105 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1574,6 +1574,9 @@ static int match_server(struct TCP_Server_Info *server, if (server->min_offload != ctx->min_offload) return 0; + if (server->retrans != ctx->retrans) + return 0; + return 1; } @@ -1798,6 +1801,7 @@ smbd_connected: goto out_err_crypto_release; } tcp_ses->min_offload = ctx->min_offload; + tcp_ses->retrans = ctx->retrans; /* * at this point we are the only ones with the pointer * to the struct since the kernel thread not created yet diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index a3493da12ad1e..52cbef2eeb28f 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -139,6 +139,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("dir_mode", Opt_dirmode), fsparam_u32("port", Opt_port), fsparam_u32("min_enc_offload", Opt_min_enc_offload), + fsparam_u32("retrans", Opt_retrans), fsparam_u32("esize", Opt_min_enc_offload), fsparam_u32("bsize", Opt_blocksize), fsparam_u32("rasize", Opt_rasize), @@ -1064,6 +1065,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_min_enc_offload: ctx->min_offload = result.uint_32; break; + case Opt_retrans: + ctx->retrans = result.uint_32; + break; case Opt_blocksize: /* * inode blocksize realistically should never need to be @@ -1619,6 +1623,8 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->backupuid_specified = false; /* no backup intent for a user */ ctx->backupgid_specified = false; /* no backup intent for a group */ + ctx->retrans = 1; + /* * short int override_uid = -1; * short int override_gid = -1; diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index cf46916286d02..182ce11cbe936 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -118,6 +118,7 @@ enum cifs_param { Opt_file_mode, Opt_dirmode, Opt_min_enc_offload, + Opt_retrans, Opt_blocksize, Opt_rasize, Opt_rsize, @@ -245,6 +246,7 @@ struct smb3_fs_context { unsigned int rsize; unsigned int wsize; unsigned int min_offload; + unsigned int retrans; bool sockopt_tcp_nodelay:1; /* attribute cache timemout for files and directories in jiffies */ unsigned long acregmax; -- GitLab From f591062bdbf4742b7f1622173017f19e927057b0 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 2 Jan 2024 13:14:46 +0000 Subject: [PATCH 1213/1290] cifs: handle servers that still advertise multichannel after disabling Some servers like Azure SMB servers always advertise multichannel capability in server capabilities list. Such servers return error STATUS_NOT_IMPLEMENTED for ioctl calls to query server interfaces, and expect clients to consider that as a sign that they do not support multichannel. We already handled this at mount time. Soon after the tree connect, we query server interfaces. And when server returned STATUS_NOT_IMPLEMENTED, we kept interface list as empty. When cifs_try_adding_channels gets called, it would not find any interfaces, so will not add channels. For the case where an active multichannel mount exists, and multichannel is disabled by such a server, this change will now allow the client to disable secondary channels on the mount. It will check the return status of query server interfaces call soon after a tree reconnect. If the return status is EOPNOTSUPP, then instead of the check to add more channels, we'll disable the secondary channels instead. For better code reuse, this change also moves the common code for disabling multichannel to a helper function. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 8 +-- fs/smb/client/smb2pdu.c | 107 +++++++++++++++++++++++++--------------- 2 files changed, 69 insertions(+), 46 deletions(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 01a5bd7e6a307..f080fac1b26ee 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -614,7 +614,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, "multichannel not available\n" "Empty network interface list returned by server %s\n", ses->server->hostname); - rc = -EINVAL; + rc = -EOPNOTSUPP; goto out; } @@ -734,12 +734,6 @@ next_iface: if ((bytes_left > 8) || p->Next) cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); - - if (!ses->iface_count) { - rc = -EINVAL; - goto out; - } - out: /* * Go through the list again and put the inactive entries diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 88c60187593f9..288199f0b987d 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -156,6 +156,57 @@ out: return; } +/* helper function for code reuse */ +static int +cifs_chan_skip_or_disable(struct cifs_ses *ses, + struct TCP_Server_Info *server, + bool from_reconnect) +{ + struct TCP_Server_Info *pserver; + unsigned int chan_index; + + if (SERVER_IS_CHAN(server)) { + cifs_dbg(VFS, + "server %s does not support multichannel anymore. Skip secondary channel\n", + ses->server->hostname); + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) { + spin_unlock(&ses->chan_lock); + goto skip_terminate; + } + + ses->chans[chan_index].server = NULL; + spin_unlock(&ses->chan_lock); + + /* + * the above reference of server by channel + * needs to be dropped without holding chan_lock + * as cifs_put_tcp_session takes a higher lock + * i.e. cifs_tcp_ses_lock + */ + cifs_put_tcp_session(server, from_reconnect); + + server->terminate = true; + cifs_signal_cifsd_for_reconnect(server, false); + + /* mark primary server as needing reconnect */ + pserver = server->primary_server; + cifs_signal_cifsd_for_reconnect(pserver, false); +skip_terminate: + mutex_unlock(&ses->session_mutex); + return -EHOSTDOWN; + } + + cifs_server_dbg(VFS, + "server does not support multichannel anymore. Disable all other channels\n"); + cifs_disable_secondary_channels(ses); + + + return 0; +} + static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct TCP_Server_Info *server, bool from_reconnect) @@ -164,8 +215,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct nls_table *nls_codepage = NULL; struct cifs_ses *ses; int xid; - struct TCP_Server_Info *pserver; - unsigned int chan_index; /* * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so @@ -310,44 +359,11 @@ again: */ if (ses->chan_count > 1 && !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { - if (SERVER_IS_CHAN(server)) { - cifs_dbg(VFS, "server %s does not support " \ - "multichannel anymore. skipping secondary channel\n", - ses->server->hostname); - - spin_lock(&ses->chan_lock); - chan_index = cifs_ses_get_chan_index(ses, server); - if (chan_index == CIFS_INVAL_CHAN_INDEX) { - spin_unlock(&ses->chan_lock); - goto skip_terminate; - } - - ses->chans[chan_index].server = NULL; - spin_unlock(&ses->chan_lock); - - /* - * the above reference of server by channel - * needs to be dropped without holding chan_lock - * as cifs_put_tcp_session takes a higher lock - * i.e. cifs_tcp_ses_lock - */ - cifs_put_tcp_session(server, from_reconnect); - - server->terminate = true; - cifs_signal_cifsd_for_reconnect(server, false); - - /* mark primary server as needing reconnect */ - pserver = server->primary_server; - cifs_signal_cifsd_for_reconnect(pserver, false); - -skip_terminate: + rc = cifs_chan_skip_or_disable(ses, server, + from_reconnect); + if (rc) { mutex_unlock(&ses->session_mutex); - rc = -EHOSTDOWN; goto out; - } else { - cifs_server_dbg(VFS, "does not support " \ - "multichannel anymore. disabling all other channels\n"); - cifs_disable_secondary_channels(ses); } } @@ -395,11 +411,23 @@ skip_sess_setup: rc = SMB3_request_interfaces(xid, tcon, false); free_xid(xid); - if (rc) + if (rc == -EOPNOTSUPP) { + /* + * some servers like Azure SMB server do not advertise + * that multichannel has been disabled with server + * capabilities, rather return STATUS_NOT_IMPLEMENTED. + * treat this as server not supporting multichannel + */ + + rc = cifs_chan_skip_or_disable(ses, server, + from_reconnect); + goto skip_add_channels; + } else if (rc) cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", __func__, rc); if (ses->chan_max > ses->chan_count && + ses->iface_count && !SERVER_IS_CHAN(server)) { if (ses->chan_count == 1) cifs_server_dbg(VFS, "supports multichannel now\n"); @@ -411,6 +439,7 @@ skip_sess_setup: } else { mutex_unlock(&ses->session_mutex); } +skip_add_channels: if (smb2_command != SMB2_INTERNAL_CMD) mod_delayed_work(cifsiod_wq, &server->reconnect, 0); -- GitLab From 78e727e58e54efca4c23863fbd9e16e9d2d83f81 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 3 Jan 2024 12:51:49 +0000 Subject: [PATCH 1214/1290] cifs: update iface_last_update on each query-and-update iface_last_update was an unused field when it was introduced. Later, when we had periodic update of server interface list, this field was used regularly to decide when to update next. However, with the new logic of updating the interfaces, it becomes crucial that this field be updated whenever parse_server_interfaces runs successfully. This change updates this field when either the server does not support query of interfaces; so that we do not query the interfaces repeatedly. It also updates the field when the function reaches the end. Fixes: aa45dadd34e4 ("cifs: change iface_list from array to sorted linked list") Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index f080fac1b26ee..d9553c2556a29 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -615,6 +615,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, "Empty network interface list returned by server %s\n", ses->server->hostname); rc = -EOPNOTSUPP; + ses->iface_last_update = jiffies; goto out; } @@ -712,7 +713,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, ses->iface_count++; spin_unlock(&ses->iface_lock); - ses->iface_last_update = jiffies; next_iface: nb_iface++; next = le32_to_cpu(p->Next); @@ -734,6 +734,8 @@ next_iface: if ((bytes_left > 8) || p->Next) cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); + ses->iface_last_update = jiffies; + out: /* * Go through the list again and put the inactive entries -- GitLab From d26270061ae66b915138af7cd73ca6f8b85e6b44 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 Jan 2024 12:31:55 -0800 Subject: [PATCH 1215/1290] string: Remove strlcpy() With all the users of strlcpy() removed[1] from the kernel, remove the API, self-tests, and other references. Leave mentions in Documentation (about its deprecation), and in checkpatch.pl (to help migrate host-only tools/ usage). Long live strscpy(). Link: https://github.com/KSPP/linux/issues/89 [1] Cc: Azeem Shaikh Cc: Andrew Morton Cc: Andy Whitcroft Cc: Joe Perches Cc: Dwaipayan Ray Cc: Lukas Bulwahn Cc: linux-hardening@vger.kernel.org Reviewed-by: Andy Shevchenko Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 51 ------------------- include/linux/string.h | 3 -- lib/nlattr.c | 2 +- lib/string.c | 15 ------ lib/test_fortify/write_overflow-strlcpy-src.c | 5 -- lib/test_fortify/write_overflow-strlcpy.c | 5 -- 6 files changed, 1 insertion(+), 80 deletions(-) delete mode 100644 lib/test_fortify/write_overflow-strlcpy-src.c delete mode 100644 lib/test_fortify/write_overflow-strlcpy.c diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 79ef6ac4c0211..89a6888f2f9e5 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -214,51 +214,6 @@ __kernel_size_t __fortify_strlen(const char * const POS p) return ret; } -/* Defined after fortified strlen() to reuse it. */ -extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); -/** - * strlcpy - Copy a string into another string buffer - * - * @p: pointer to destination of copy - * @q: pointer to NUL-terminated source string to copy - * @size: maximum number of bytes to write at @p - * - * If strlen(@q) >= @size, the copy of @q will be truncated at - * @size - 1 bytes. @p will always be NUL-terminated. - * - * Do not use this function. While FORTIFY_SOURCE tries to avoid - * over-reads when calculating strlen(@q), it is still possible. - * Prefer strscpy(), though note its different return values for - * detecting truncation. - * - * Returns total number of bytes written to @p, including terminating NUL. - * - */ -__FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, size_t size) -{ - const size_t p_size = __member_size(p); - const size_t q_size = __member_size(q); - size_t q_len; /* Full count of source string length. */ - size_t len; /* Count of characters going into destination. */ - - if (p_size == SIZE_MAX && q_size == SIZE_MAX) - return __real_strlcpy(p, q, size); - q_len = strlen(q); - len = (q_len >= size) ? size - 1 : q_len; - if (__builtin_constant_p(size) && __builtin_constant_p(q_len) && size) { - /* Write size is always larger than destination. */ - if (len >= p_size) - __write_overflow(); - } - if (size) { - if (len >= p_size) - fortify_panic(__func__); - __underlying_memcpy(p, q, len); - p[len] = '\0'; - } - return q_len; -} - /* Defined after fortified strnlen() to reuse it. */ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); /** @@ -272,12 +227,6 @@ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); * @p buffer. The behavior is undefined if the string buffers overlap. The * destination @p buffer is always NUL terminated, unless it's zero-sized. * - * Preferred to strlcpy() since the API doesn't require reading memory - * from the source @q string beyond the specified @size bytes, and since - * the return value is easier to error-check than strlcpy()'s. - * In addition, the implementation is robust to the string changing out - * from underneath it, unlike the current strlcpy() implementation. - * * Preferred to strncpy() since it always returns a valid string, and * doesn't unnecessarily force the tail of the destination buffer to be * zero padded. If padding is desired please use strscpy_pad(). diff --git a/include/linux/string.h b/include/linux/string.h index ce137830a0b99..ab148d8dbfc14 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -66,9 +66,6 @@ extern char * strcpy(char *,const char *); #ifndef __HAVE_ARCH_STRNCPY extern char * strncpy(char *,const char *, __kernel_size_t); #endif -#ifndef __HAVE_ARCH_STRLCPY -size_t strlcpy(char *, const char *, size_t); -#endif #ifndef __HAVE_ARCH_STRSCPY ssize_t strscpy(char *, const char *, size_t); #endif diff --git a/lib/nlattr.c b/lib/nlattr.c index dc15e7888fc1f..ed2ab43e1b22c 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -758,7 +758,7 @@ EXPORT_SYMBOL(nla_find); * @dstsize: Size of destination buffer. * * Copies at most dstsize - 1 bytes into the destination buffer. - * Unlike strlcpy the destination buffer is always padded out. + * Unlike strscpy() the destination buffer is always padded out. * * Return: * * srclen - Returns @nla length (not including the trailing %NUL). diff --git a/lib/string.c b/lib/string.c index be26623953d2e..6891d15ce991c 100644 --- a/lib/string.c +++ b/lib/string.c @@ -103,21 +103,6 @@ char *strncpy(char *dest, const char *src, size_t count) EXPORT_SYMBOL(strncpy); #endif -#ifndef __HAVE_ARCH_STRLCPY -size_t strlcpy(char *dest, const char *src, size_t size) -{ - size_t ret = strlen(src); - - if (size) { - size_t len = (ret >= size) ? size - 1 : ret; - __builtin_memcpy(dest, src, len); - dest[len] = '\0'; - } - return ret; -} -EXPORT_SYMBOL(strlcpy); -#endif - #ifndef __HAVE_ARCH_STRSCPY ssize_t strscpy(char *dest, const char *src, size_t count) { diff --git a/lib/test_fortify/write_overflow-strlcpy-src.c b/lib/test_fortify/write_overflow-strlcpy-src.c deleted file mode 100644 index 91bf83ebd34a5..0000000000000 --- a/lib/test_fortify/write_overflow-strlcpy-src.c +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#define TEST \ - strlcpy(small, large_src, sizeof(small) + 1) - -#include "test_fortify.h" diff --git a/lib/test_fortify/write_overflow-strlcpy.c b/lib/test_fortify/write_overflow-strlcpy.c deleted file mode 100644 index 1883db7c0cd67..0000000000000 --- a/lib/test_fortify/write_overflow-strlcpy.c +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#define TEST \ - strlcpy(instance.buf, large_src, sizeof(instance.buf) + 1) - -#include "test_fortify.h" -- GitLab From 68ea60a7961ca6c7c38f856572a146f66949815d Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 19 Jan 2024 14:20:57 +0800 Subject: [PATCH 1216/1290] coccinelle: device_attr_show: Adapt to the latest Documentation/filesystems/sysfs.rst Adapt description, warning message and MODE=patch according to the latest Documentation/filesystems/sysfs.rst: > show() should only use sysfs_emit() or sysfs_emit_at() when formatting > the value to be returned to user space. After this patch: When MODE=report, $ make coccicheck COCCI=scripts/coccinelle/api/device_attr_show.cocci M=drivers/hid/hid-picolcd_core.c MODE=report <...snip...> drivers/hid/hid-picolcd_core.c:304:8-16: WARNING: please use sysfs_emit or sysfs_emit_at drivers/hid/hid-picolcd_core.c:259:9-17: WARNING: please use sysfs_emit or sysfs_emit_at When MODE=patch, $ make coccicheck COCCI=scripts/coccinelle/api/device_attr_show.cocci M=drivers/hid/hid-picolcd_core.c MODE=patch <...snip...> diff -u -p a/drivers/hid/hid-picolcd_core.c b/drivers/hid/hid-picolcd_core.c --- a/drivers/hid/hid-picolcd_core.c +++ b/drivers/hid/hid-picolcd_core.c @@ -255,10 +255,12 @@ static ssize_t picolcd_operation_mode_sh { struct picolcd_data *data = dev_get_drvdata(dev); - if (data->status & PICOLCD_BOOTLOADER) - return snprintf(buf, PAGE_SIZE, "[bootloader] lcd\n"); - else - return snprintf(buf, PAGE_SIZE, "bootloader [lcd]\n"); + if (data->status & PICOLCD_BOOTLOADER) { + return sysfs_emit(buf, "[bootloader] lcd\n"); + } + else { + return sysfs_emit(buf, "bootloader [lcd]\n"); + } } static ssize_t picolcd_operation_mode_store(struct device *dev, @@ -301,7 +303,7 @@ static ssize_t picolcd_operation_mode_de { struct picolcd_data *data = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE, "hello world\n"); + return sysfs_emit(buf, "hello world\n"); } static ssize_t picolcd_operation_mode_delay_store(struct device *dev, CC: Julia Lawall CC: Nicolas Palix CC: cocci@inria.fr Signed-off-by: Li Zhijian --- scripts/coccinelle/api/device_attr_show.cocci | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/scripts/coccinelle/api/device_attr_show.cocci b/scripts/coccinelle/api/device_attr_show.cocci index a28dc061653aa..634514937e634 100644 --- a/scripts/coccinelle/api/device_attr_show.cocci +++ b/scripts/coccinelle/api/device_attr_show.cocci @@ -1,10 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only /// /// From Documentation/filesystems/sysfs.rst: -/// show() must not use snprintf() when formatting the value to be -/// returned to user space. If you can guarantee that an overflow -/// will never happen you can use sprintf() otherwise you must use -/// scnprintf(). +/// show() should only use sysfs_emit() or sysfs_emit_at() when formatting +/// the value to be returned to user space. /// // Confidence: High // Copyright: (C) 2020 Denis Efremov ISPRAS @@ -30,15 +28,21 @@ ssize_t show(struct device *dev, struct device_attribute *attr, char *buf) @rp depends on patch@ identifier show, dev, attr, buf; +expression BUF, SZ, FORMAT, STR; @@ ssize_t show(struct device *dev, struct device_attribute *attr, char *buf) { <... +( return -- snprintf -+ scnprintf - (...); +- snprintf(BUF, SZ, FORMAT, STR); ++ sysfs_emit(BUF, FORMAT, STR); +| + return +- snprintf(BUF, SZ, STR); ++ sysfs_emit(BUF, STR); +) ...> } @@ -46,10 +50,10 @@ ssize_t show(struct device *dev, struct device_attribute *attr, char *buf) p << r.p; @@ -coccilib.report.print_report(p[0], "WARNING: use scnprintf or sprintf") +coccilib.report.print_report(p[0], "WARNING: please use sysfs_emit or sysfs_emit_at") @script: python depends on org@ p << r.p; @@ -coccilib.org.print_todo(p[0], "WARNING: use scnprintf or sprintf") +coccilib.org.print_todo(p[0], "WARNING: please use sysfs_emit or sysfs_emit_at") -- GitLab From 2bebc3cd48701607e38e8258ab9692de9b1a718b Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 19 Jan 2024 21:47:15 +0100 Subject: [PATCH 1217/1290] Revert "firmware/sysfb: Clear screen_info state after consuming it" This reverts commit df67699c9cb0ceb70f6cc60630ca938c06773eda. Jens Axboe reported a regression that his machine is failing to show a console, or in fact anything, on current -git. There's no output and no console after: Loading Linux 6.7.0+ ... Loading initial ramdisk ... Signed-off-by: Helge Deller Cc: Thomas Zimmermann Cc: Jens Axboe --- drivers/firmware/sysfb.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/firmware/sysfb.c b/drivers/firmware/sysfb.c index 19706bd2642ad..82fcfd29bc4d2 100644 --- a/drivers/firmware/sysfb.c +++ b/drivers/firmware/sysfb.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL_GPL(sysfb_disable); static __init int sysfb_init(void) { - const struct screen_info *si = &screen_info; + struct screen_info *si = &screen_info; struct simplefb_platform_data mode; const char *name; bool compatible; @@ -119,18 +119,6 @@ static __init int sysfb_init(void) if (ret) goto err; - /* - * The firmware framebuffer is now maintained by the created - * device. Disable screen_info after we've consumed it. Prevents - * invalid access during kexec reboots. - * - * TODO: Vgacon still relies on the global screen_info. Make - * vgacon work with the platform device, so we can clear - * the screen_info unconditionally. - */ - if (strcmp(name, "platform-framebuffer")) - screen_info.orig_video_isVGA = 0; - goto unlock_mutex; err: platform_device_put(pd); -- GitLab From 978ffcbf00d82b03b79e64b5c8249589b50e7463 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Jan 2024 16:43:04 -0800 Subject: [PATCH 1218/1290] execve: open the executable file before doing anything else No point in allocating a new mm, counting arguments and environment variables etc if we're just going to return ENOENT. This patch does expose the fact that 'do_filp_open()' that execve() uses is still unnecessarily expensive in the failure case, because it allocates the 'struct file *' early, even if the path lookup (which is heavily optimized) fails. So that remains an unnecessary cost in the "no such executable" case, but it's a separate issue. Regardless, I do not want to do _both_ a filename_lookup() and a later do_filp_open() like the origin patch by Josh Triplett did in [1]. Reported-by: Josh Triplett Cc: Kees Cook Cc: Mateusz Guzik Cc: Al Viro Link: https://lore.kernel.org/lkml/5c7333ea4bec2fad1b47a8fa2db7c31e4ffc4f14.1663334978.git.josh@joshtriplett.org/ [1] Link: https://lore.kernel.org/lkml/202209161637.9EDAF6B18@keescook/ Link: https://lore.kernel.org/lkml/CAHk-=wgznerM-xs+x+krDfE7eVBiy_HOam35rbsFMMOwvYuEKQ@mail.gmail.com/ Link: https://lore.kernel.org/lkml/CAHk-=whf9qLO8ipps4QhmS0BkM8mtWJhvnuDSdtw5gFjhzvKNA@mail.gmail.com/ Signed-off-by: Linus Torvalds --- fs/exec.c | 69 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 73e4045df271d..8cdd5b2dd09c2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1508,12 +1508,24 @@ static void free_bprm(struct linux_binprm *bprm) kfree(bprm); } -static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) +static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { - struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + struct linux_binprm *bprm; + struct file *file; int retval = -ENOMEM; - if (!bprm) - goto out; + + file = do_open_execat(fd, filename, flags); + if (IS_ERR(file)) + return ERR_CAST(file); + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) { + allow_write_access(file); + fput(file); + return ERR_PTR(-ENOMEM); + } + + bprm->file = file; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; @@ -1526,18 +1538,28 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) if (!bprm->fdpath) goto out_free; + /* + * Record that a name derived from an O_CLOEXEC fd will be + * inaccessible after exec. This allows the code in exec to + * choose to fail when the executable is not mmaped into the + * interpreter and an open file descriptor is not passed to + * the interpreter. This makes for a better user experience + * than having the interpreter start and then immediately fail + * when it finds the executable is inaccessible. + */ + if (get_close_on_exec(fd)) + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; + bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); - if (retval) - goto out_free; - return bprm; + if (!retval) + return bprm; out_free: free_bprm(bprm); -out: return ERR_PTR(retval); } @@ -1807,10 +1829,8 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int bprm_execve(struct linux_binprm *bprm, - int fd, struct filename *filename, int flags) +static int bprm_execve(struct linux_binprm *bprm) { - struct file *file; int retval; retval = prepare_bprm_creds(bprm); @@ -1826,26 +1846,8 @@ static int bprm_execve(struct linux_binprm *bprm, current->in_execve = 1; sched_mm_cid_before_execve(current); - file = do_open_execat(fd, filename, flags); - retval = PTR_ERR(file); - if (IS_ERR(file)) - goto out_unmark; - sched_exec(); - bprm->file = file; - /* - * Record that a name derived from an O_CLOEXEC fd will be - * inaccessible after exec. This allows the code in exec to - * choose to fail when the executable is not mmaped into the - * interpreter and an open file descriptor is not passed to - * the interpreter. This makes for a better user experience - * than having the interpreter start and then immediately fail - * when it finds the executable is inaccessible. - */ - if (bprm->fdpath && get_close_on_exec(fd)) - bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; - /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval) @@ -1875,7 +1877,6 @@ out: if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); -out_unmark: sched_mm_cid_after_execve(current); current->fs->in_exec = 0; current->in_execve = 0; @@ -1910,7 +1911,7 @@ static int do_execveat_common(int fd, struct filename *filename, * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; - bprm = alloc_bprm(fd, filename); + bprm = alloc_bprm(fd, filename, flags); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; @@ -1959,7 +1960,7 @@ static int do_execveat_common(int fd, struct filename *filename, bprm->argc = 1; } - retval = bprm_execve(bprm, fd, filename, flags); + retval = bprm_execve(bprm); out_free: free_bprm(bprm); @@ -1984,7 +1985,7 @@ int kernel_execve(const char *kernel_filename, if (IS_ERR(filename)) return PTR_ERR(filename); - bprm = alloc_bprm(fd, filename); + bprm = alloc_bprm(fd, filename, 0); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; @@ -2019,7 +2020,7 @@ int kernel_execve(const char *kernel_filename, if (retval < 0) goto out_free; - retval = bprm_execve(bprm, fd, filename, 0); + retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: -- GitLab From ff82e84e80fc0c93095f5a36e0a3508ac121ab80 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 20 Jan 2024 21:56:11 +0100 Subject: [PATCH 1219/1290] coccinelle: device_attr_show: simplify patch case Replacing the final expression argument by ... allows the format string to have multiple arguments. It also has the advantage of allowing the change to be recognized as a change in a single statement, thus avoiding adding unneeded braces. Signed-off-by: Julia Lawall --- scripts/coccinelle/api/device_attr_show.cocci | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/scripts/coccinelle/api/device_attr_show.cocci b/scripts/coccinelle/api/device_attr_show.cocci index 634514937e634..550d1d2fc02a9 100644 --- a/scripts/coccinelle/api/device_attr_show.cocci +++ b/scripts/coccinelle/api/device_attr_show.cocci @@ -34,15 +34,10 @@ expression BUF, SZ, FORMAT, STR; ssize_t show(struct device *dev, struct device_attribute *attr, char *buf) { <... -( return -- snprintf(BUF, SZ, FORMAT, STR); -+ sysfs_emit(BUF, FORMAT, STR); -| - return -- snprintf(BUF, SZ, STR); -+ sysfs_emit(BUF, STR); -) +- snprintf(BUF, SZ, FORMAT ++ sysfs_emit(BUF, FORMAT + ,...); ...> } -- GitLab From 31e97d7c9ae3de072d7b424b2cf706a03ec10720 Mon Sep 17 00:00:00 2001 From: Aurelien Jarno Date: Sat, 13 Jan 2024 19:33:31 +0100 Subject: [PATCH 1220/1290] media: solo6x10: replace max(a, min(b, c)) by clamp(b, a, c) This patch replaces max(a, min(b, c)) by clamp(b, a, c) in the solo6x10 driver. This improves the readability and more importantly, for the solo6x10-p2m.c file, this reduces on my system (x86-64, gcc 13): - the preprocessed size from 121 MiB to 4.5 MiB; - the build CPU time from 46.8 s to 1.6 s; - the build memory from 2786 MiB to 98MiB. In fine, this allows this relatively simple C file to be built on a 32-bit system. Reported-by: Jiri Slaby Closes: https://lore.kernel.org/lkml/18c6df0d-45ed-450c-9eda-95160a2bbb8e@gmail.com/ Cc: # v6.7+ Suggested-by: David Laight Signed-off-by: Aurelien Jarno Reviewed-by: David Laight Reviewed-by: Hans Verkuil Signed-off-by: Linus Torvalds --- drivers/media/pci/solo6x10/solo6x10-offsets.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/media/pci/solo6x10/solo6x10-offsets.h b/drivers/media/pci/solo6x10/solo6x10-offsets.h index f414ee1316f29..fdbb817e63601 100644 --- a/drivers/media/pci/solo6x10/solo6x10-offsets.h +++ b/drivers/media/pci/solo6x10/solo6x10-offsets.h @@ -57,16 +57,16 @@ #define SOLO_MP4E_EXT_ADDR(__solo) \ (SOLO_EREF_EXT_ADDR(__solo) + SOLO_EREF_EXT_AREA(__solo)) #define SOLO_MP4E_EXT_SIZE(__solo) \ - max((__solo->nr_chans * 0x00080000), \ - min(((__solo->sdram_size - SOLO_MP4E_EXT_ADDR(__solo)) - \ - __SOLO_JPEG_MIN_SIZE(__solo)), 0x00ff0000)) + clamp(__solo->sdram_size - SOLO_MP4E_EXT_ADDR(__solo) - \ + __SOLO_JPEG_MIN_SIZE(__solo), \ + __solo->nr_chans * 0x00080000, 0x00ff0000) #define __SOLO_JPEG_MIN_SIZE(__solo) (__solo->nr_chans * 0x00080000) #define SOLO_JPEG_EXT_ADDR(__solo) \ (SOLO_MP4E_EXT_ADDR(__solo) + SOLO_MP4E_EXT_SIZE(__solo)) #define SOLO_JPEG_EXT_SIZE(__solo) \ - max(__SOLO_JPEG_MIN_SIZE(__solo), \ - min((__solo->sdram_size - SOLO_JPEG_EXT_ADDR(__solo)), 0x00ff0000)) + clamp(__solo->sdram_size - SOLO_JPEG_EXT_ADDR(__solo), \ + __SOLO_JPEG_MIN_SIZE(__solo), 0x00ff0000) #define SOLO_SDRAM_END(__solo) \ (SOLO_JPEG_EXT_ADDR(__solo) + SOLO_JPEG_EXT_SIZE(__solo)) -- GitLab From fead90507a37e73d41f6059b325b34412ed8d84b Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 5 Jan 2024 10:06:01 +0800 Subject: [PATCH 1221/1290] fbdev: vt8500lcdfb: Remove unnecessary print function dev_err() The print function dev_err() is redundant because platform_get_irq() already prints an error. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7824 Signed-off-by: Jiapeng Chong Signed-off-by: Helge Deller --- drivers/video/fbdev/vt8500lcdfb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/video/fbdev/vt8500lcdfb.c b/drivers/video/fbdev/vt8500lcdfb.c index 42c25dc851976..ac73937073a76 100644 --- a/drivers/video/fbdev/vt8500lcdfb.c +++ b/drivers/video/fbdev/vt8500lcdfb.c @@ -374,7 +374,6 @@ static int vt8500lcd_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { - dev_err(&pdev->dev, "no IRQ defined\n"); ret = -ENODEV; goto failed_free_palette; } -- GitLab From 04e5eac8f3ab2ff52fa191c187a46d4fdbc1e288 Mon Sep 17 00:00:00 2001 From: Fullway Wang Date: Thu, 18 Jan 2024 11:49:40 +0800 Subject: [PATCH 1222/1290] fbdev: savage: Error out if pixclock equals zero The userspace program could pass any values to the driver through ioctl() interface. If the driver doesn't check the value of pixclock, it may cause divide-by-zero error. Although pixclock is checked in savagefb_decode_var(), but it is not checked properly in savagefb_probe(). Fix this by checking whether pixclock is zero in the function savagefb_check_var() before info->var.pixclock is used as the divisor. This is similar to CVE-2022-3061 in i740fb which was fixed by commit 15cf0b8. Signed-off-by: Fullway Wang Signed-off-by: Helge Deller --- drivers/video/fbdev/savage/savagefb_driver.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/video/fbdev/savage/savagefb_driver.c b/drivers/video/fbdev/savage/savagefb_driver.c index dddd6afcb972a..ebc9aeffdde7c 100644 --- a/drivers/video/fbdev/savage/savagefb_driver.c +++ b/drivers/video/fbdev/savage/savagefb_driver.c @@ -869,6 +869,9 @@ static int savagefb_check_var(struct fb_var_screeninfo *var, DBG("savagefb_check_var"); + if (!var->pixclock) + return -EINVAL; + var->transp.offset = 0; var->transp.length = 0; switch (var->bits_per_pixel) { -- GitLab From e421946be7d9bf545147bea8419ef8239cb7ca52 Mon Sep 17 00:00:00 2001 From: Fullway Wang Date: Thu, 18 Jan 2024 14:24:43 +0800 Subject: [PATCH 1223/1290] fbdev: sis: Error out if pixclock equals zero The userspace program could pass any values to the driver through ioctl() interface. If the driver doesn't check the value of pixclock, it may cause divide-by-zero error. In sisfb_check_var(), var->pixclock is used as a divisor to caculate drate before it is checked against zero. Fix this by checking it at the beginning. This is similar to CVE-2022-3061 in i740fb which was fixed by commit 15cf0b8. Signed-off-by: Fullway Wang Signed-off-by: Helge Deller --- drivers/video/fbdev/sis/sis_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/video/fbdev/sis/sis_main.c b/drivers/video/fbdev/sis/sis_main.c index 803ccb6aa4797..009bf1d926448 100644 --- a/drivers/video/fbdev/sis/sis_main.c +++ b/drivers/video/fbdev/sis/sis_main.c @@ -1444,6 +1444,8 @@ sisfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) vtotal = var->upper_margin + var->lower_margin + var->vsync_len; + if (!var->pixclock) + return -EINVAL; pixclock = var->pixclock; if((var->vmode & FB_VMODE_MASK) == FB_VMODE_NONINTERLACED) { -- GitLab From e240c1b3635e3fc7d3ba46c6fe12a0d8efb2941a Mon Sep 17 00:00:00 2001 From: Su Yue Date: Mon, 8 Jan 2024 23:11:08 +0800 Subject: [PATCH 1224/1290] bcachefs: fix memleak in bch2_split_devs The pointer dev_name can be modified by strseq(), then causes the memleak: unreferenced object 0xffff9d08a2916c80 (size 32): comm "mount.bcachefs", pid 9090, jiffies 4295856224 (age 17.564s) hex dump (first 32 bytes): 2f 64 65 76 2f 6d 61 70 70 65 72 2f 74 65 73 74 /dev/mapper/test 2d 30 00 00 00 00 00 00 00 00 00 00 00 00 00 00 -0.............. backtrace: [<00000000c5d3be7d>] __kmem_cache_alloc_node+0x1f3/0x2c0 [<0000000052215d26>] __kmalloc_node_track_caller+0x51/0x150 [<0000000069fea956>] kstrdup+0x32/0x60 [<000000000877fcf1>] bch2_split_devs+0x3f/0x150 [bcachefs] [<000000007ee93204>] bch2_mount+0xcb/0x640 [bcachefs] [<000000002dd1e04b>] legacy_get_tree+0x30/0x60 [<000000006afc31d3>] vfs_get_tree+0x28/0xf0 [<000000007b0c538e>] path_mount+0x475/0xb60 [<0000000092de5882>] __x64_sys_mount+0x105/0x140 [<0000000054fc05d8>] do_syscall_64+0x42/0xf0 [<00000000df584910>] entry_SYSCALL_64_after_hwframe+0x6e/0x76 Fix it by copy pointer dev_name at beginning and free the copied pointer at end. Signed-off-by: Su Yue Reviewed-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index c2ef7cddaa4fc..f927c8a19e241 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -1186,7 +1186,9 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret) { darray_init(ret); - char *dev_name = kstrdup(_dev_name, GFP_KERNEL), *s = dev_name; + char *dev_name, *s, *orig; + + dev_name = orig = kstrdup(_dev_name, GFP_KERNEL); if (!dev_name) return -ENOMEM; @@ -1201,10 +1203,10 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret) } } - kfree(dev_name); + kfree(orig); return 0; err: bch2_darray_str_exit(ret); - kfree(dev_name); + kfree(orig); return -ENOMEM; } -- GitLab From 4ecad0da9de830681ffff973bc0d47b07612bbe6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Jan 2024 23:08:30 -0500 Subject: [PATCH 1225/1290] bcachefs: Don't log errors if BCH_WRITE_ALLOC_NOWAIT Previously, we added logging in the write path to ensure that any unexpected errors getting reported to userspace have a log message; but BCH_WRITE_ALLOC_NOWAIT is a special case, it's used for promotes where errors are expected and not reported out to userspace - so we need to silence those. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 33c0e783d5469..e69c00fa32bdb 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1447,10 +1447,11 @@ err: op->flags |= BCH_WRITE_DONE; if (ret < 0) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s(): error: %s", __func__, bch2_err_str(ret)); + if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s(): error: %s", __func__, bch2_err_str(ret)); op->error = ret; break; } -- GitLab From 3fe8a1864042f7793e8fd79e4e24678839207153 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Jan 2024 19:29:14 -0500 Subject: [PATCH 1226/1290] bcachefs: eytzinger_for_each() declares loop iter Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 2 +- fs/bcachefs/eytzinger.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 74bf8eb90a4c4..044fff9b2cf6e 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -720,7 +720,7 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); struct bkey_i min_key, max_key; - unsigned j, cacheline = 1; + unsigned cacheline = 1; t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), bset_ro_tree_capacity(b, t)); diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 9637f636e32d5..b04750dbf870b 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -156,7 +156,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) } #define eytzinger1_for_each(_i, _size) \ - for ((_i) = eytzinger1_first((_size)); \ + for (unsigned (_i) = eytzinger1_first((_size)); \ (_i) != 0; \ (_i) = eytzinger1_next((_i), (_size))) @@ -227,7 +227,7 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) } #define eytzinger0_for_each(_i, _size) \ - for ((_i) = eytzinger0_first((_size)); \ + for (unsigned (_i) = eytzinger0_first((_size)); \ (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) -- GitLab From 9d5dba2ba86de28f74497d9018889918d368d9fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Jan 2024 19:47:09 -0500 Subject: [PATCH 1227/1290] bcachefs: drop to_text code for obsolete bps in alloc keys Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a09b9d00226a4..f31541a95537a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -321,7 +321,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c { struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - unsigned i; prt_newline(out); printbuf_indent_add(out, 2); @@ -353,23 +352,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "fragmentation %llu", a->fragmentation_lru); prt_newline(out); prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); - prt_newline(out); - - if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) { - struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k); - const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v); - - prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v)); - printbuf_indent_add(out, 2); - - for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) { - prt_newline(out); - bch2_backpointer_to_text(out, &bps[i]); - } - - printbuf_indent_sub(out, 2); - } - printbuf_indent_sub(out, 2); } -- GitLab From 38c23fb809f60bda1adfc431b18200b8f68c2025 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 7 Jan 2024 17:14:46 -0500 Subject: [PATCH 1228/1290] bcachefs: BTREE_TRIGGER_ATOMIC Add a new flag to be explicit about when we're running atomic triggers. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/bkey_methods.h | 10 ++++++---- fs/bcachefs/btree_trans_commit.c | 19 +++++++------------ fs/bcachefs/btree_types.h | 4 ++-- fs/bcachefs/ec.c | 2 +- fs/bcachefs/inode.c | 2 +- fs/bcachefs/reflink.c | 10 +++++----- fs/bcachefs/reflink.h | 8 ++++---- 8 files changed, 27 insertions(+), 30 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index f31541a95537a..bebaaf8dbeea8 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -821,7 +821,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, } } - if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; u64 journal_seq = trans->journal_res.seq; u64 bucket_journal_seq = new_a->journal_seq; diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index ee82283722b75..03efe8ee565a9 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -83,9 +83,10 @@ enum btree_update_flags { __BTREE_TRIGGER_NORUN, __BTREE_TRIGGER_TRANSACTIONAL, + __BTREE_TRIGGER_ATOMIC, + __BTREE_TRIGGER_GC, __BTREE_TRIGGER_INSERT, __BTREE_TRIGGER_OVERWRITE, - __BTREE_TRIGGER_GC, __BTREE_TRIGGER_BUCKET_INVALIDATE, }; @@ -107,6 +108,10 @@ enum btree_update_flags { * causing us to go emergency read-only) */ #define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL) +#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC) + +/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */ +#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) /* @new is entering the btree */ #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) @@ -114,9 +119,6 @@ enum btree_update_flags { /* @old is leaving the btree */ #define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */ -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) - /* signal from bucket invalidate path to alloc trigger */ #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 90eb8065ff2da..e3a82c33912b9 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -448,9 +448,6 @@ static int run_one_mem_trigger(struct btree_trans *trans, if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id))) - return 0; - if (old_ops->trigger == new_ops->trigger) { ret = bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(new), @@ -586,9 +583,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { - struct bch_fs *c = trans->c; - int ret = 0; - trans_for_each_update(trans, i) { /* * XXX: synchronization of cached update triggers with gc @@ -596,14 +590,15 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) */ BUG_ON(i->cached || i->level); - if (gc_visited(c, gc_pos_btree_node(insert_l(trans, i)->b))) { - ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && + gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { + int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); if (ret) - break; + return ret; } } - return ret; + return 0; } static inline int @@ -689,8 +684,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, i->flags); + if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags); if (ret) goto fatal_err; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d530307046f4c..e46867536fa6a 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -653,7 +653,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); BIT_ULL(BKEY_TYPE_reflink)| \ BIT_ULL(BKEY_TYPE_btree)) -#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ +#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ (BIT_ULL(BKEY_TYPE_alloc)| \ BIT_ULL(BKEY_TYPE_inodes)| \ BIT_ULL(BKEY_TYPE_stripes)| \ @@ -661,7 +661,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); #define BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ - BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS) static inline bool btree_node_type_needs_gc(enum btree_node_type type) { diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d802bc63c8d0b..b29b8a20bb8b4 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -367,7 +367,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, } } - if (!(flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))) { + if (flags & BTREE_TRIGGER_ATOMIC) { struct stripe *m = genradix_ptr(&c->stripes, idx); if (!m) { diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 37dce96f48ac4..51a06324b21d3 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -587,7 +587,7 @@ int bch2_trigger_inode(struct btree_trans *trans, } } - if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { BUG_ON(!trans->journal_res.seq); bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index faa5d36700587..6070109174210 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -292,10 +292,10 @@ static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *f } } -int bch2_trans_mark_reflink_v(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s new, - unsigned flags) +int bch2_trigger_reflink_v(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) @@ -324,7 +324,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, min(datalen, 32U), d.v->data); } -int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, +int bch2_trigger_indirect_inline_data(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, unsigned flags) diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 8ee778ec0022a..4d8867289717b 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -24,14 +24,14 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, +int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ - .trigger = bch2_trans_mark_reflink_v, \ + .trigger = bch2_trigger_reflink_v, \ .min_val_size = 8, \ }) @@ -39,7 +39,7 @@ int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_indirect_inline_data(struct btree_trans *, +int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, unsigned); @@ -47,7 +47,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *, #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ .val_to_text = bch2_indirect_inline_data_to_text, \ - .trigger = bch2_trans_mark_indirect_inline_data, \ + .trigger = bch2_trigger_indirect_inline_data, \ .min_val_size = 8, \ }) -- GitLab From e58f963cecbdb08f28334122afba93a7840beabc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Jan 2024 20:57:43 -0500 Subject: [PATCH 1229/1290] bcachefs: helpers for printing data types We need bounds checking since new versions may introduce new data types. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 9 +++------ fs/bcachefs/alloc_foreground.c | 7 ++++--- fs/bcachefs/btree_gc.c | 24 ++++++++++++------------ fs/bcachefs/buckets.c | 26 +++++++++++++------------- fs/bcachefs/buckets.h | 15 +++++++++++++++ fs/bcachefs/ec.c | 4 ++-- fs/bcachefs/journal_io.c | 5 +---- fs/bcachefs/move.c | 6 +++--- fs/bcachefs/opts.c | 2 +- fs/bcachefs/opts.h | 2 +- fs/bcachefs/replicas.c | 18 ++++-------------- fs/bcachefs/sb-members.c | 4 ++-- fs/bcachefs/super.c | 2 +- fs/bcachefs/sysfs.c | 4 ++-- 14 files changed, 64 insertions(+), 64 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index bebaaf8dbeea8..614da759226b5 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -273,7 +273,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v), c, err, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", - bch2_data_types[a.v->data_type]); + bch2_data_type_str(a.v->data_type)); break; case BCH_DATA_cached: bkey_fsck_err_on(!a.v->cached_sectors || @@ -325,11 +325,8 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "gen %u oldest_gen %u data_type %s", - a->gen, a->oldest_gen, - a->data_type < BCH_DATA_NR - ? bch2_data_types[a->data_type] - : "(invalid data type)"); + prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); + bch2_prt_data_type(out, a->data_type); prt_newline(out); prt_printf(out, "journal_seq %llu", a->journal_seq); prt_newline(out); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index b0ff47998a944..633d3223b353f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1525,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str unsigned data_type = ob->data_type; barrier(); /* READ_ONCE() doesn't work on bitfields */ - prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u", + prt_printf(out, "%zu ref %u ", ob - c->open_buckets, - atomic_read(&ob->pin), - data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type", + atomic_read(&ob->pin)); + bch2_prt_data_type(out, data_type); + prt_printf(out, " %u:%llu gen %u allocated %u/%u", ob->dev, ob->bucket, ob->gen, ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); if (ob->ec) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 49b4ade758c36..523e9b1069cd2 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -597,7 +597,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { @@ -615,7 +615,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { @@ -637,7 +637,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) @@ -649,7 +649,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) @@ -664,8 +664,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[g->data_type], - bch2_data_types[data_type], + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (data_type == BCH_DATA_btree) { @@ -1238,11 +1238,11 @@ static int bch2_gc_done(struct bch_fs *c, for (i = 0; i < BCH_DATA_NR; i++) { copy_dev_field(dev_usage_buckets_wrong, - d[i].buckets, "%s buckets", bch2_data_types[i]); + d[i].buckets, "%s buckets", bch2_data_type_str(i)); copy_dev_field(dev_usage_sectors_wrong, - d[i].sectors, "%s sectors", bch2_data_types[i]); + d[i].sectors, "%s sectors", bch2_data_type_str(i)); copy_dev_field(dev_usage_fragmented_wrong, - d[i].fragmented, "%s fragmented", bch2_data_types[i]); + d[i].fragmented, "%s fragmented", bch2_data_type_str(i)); } } @@ -1417,8 +1417,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, ": got %s, should be %s", iter->pos.inode, iter->pos.offset, gc.gen, - bch2_data_types[new.data_type], - bch2_data_types[gc.data_type])) + bch2_data_type_str(new.data_type), + bch2_data_type_str(gc.data_type))) new.data_type = gc.data_type; #define copy_bucket_field(_errtype, _f) \ @@ -1428,7 +1428,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ gc.gen, \ - bch2_data_types[gc.data_type], \ + bch2_data_type_str(gc.data_type), \ new._f, gc._f)) \ new._f = gc._f; \ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index d83ea0e53df3f..5dc19363bb9fe 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -284,7 +284,7 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) prt_newline(out); for (unsigned i = 0; i < BCH_DATA_NR; i++) { - prt_str(out, bch2_data_types[i]); + bch2_prt_data_type(out, i); prt_tab(out); prt_u64(out, usage->d[i].buckets); prt_tab_rjust(out); @@ -523,8 +523,8 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (bch2_fs_inconsistent_on(g->data_type && g->data_type != data_type, c, "different types of data in same bucket: %s, %s", - bch2_data_types[g->data_type], - bch2_data_types[data_type])) { + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type))) { ret = -EIO; goto err; } @@ -532,7 +532,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, - bch2_data_types[g->data_type ?: data_type], + bch2_data_type_str(g->data_type ?: data_type), g->dirty_sectors, sectors)) { ret = -EIO; goto err; @@ -575,7 +575,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EIO; @@ -588,7 +588,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -603,7 +603,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "while marking %s", ptr->dev, bucket_nr, b_gen, *bucket_gen(ca, bucket_nr), - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -624,8 +624,8 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type], - bch2_data_types[ptr_data_type], + bch2_data_type_str(bucket_data_type), + bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EIO; @@ -638,7 +638,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), bucket_sectors, sectors, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -1130,9 +1130,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], - bch2_data_types[type], - bch2_data_types[type]); + bch2_data_type_str(a->v.data_type), + bch2_data_type_str(type), + bch2_data_type_str(type)); ret = -EIO; goto err; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 2c95cc5d86be6..2b1e907f2acad 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -385,6 +385,21 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) return false; } +static inline const char *bch2_data_type_str(enum bch_data_type type) +{ + return type < BCH_DATA_NR + ? __bch2_data_types[type] + : "(invalid data type)"; +} + +static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type) +{ + if (type < BCH_DATA_NR) + prt_str(out, __bch2_data_types[type]); + else + prt_printf(out, "(invalid data type %u)", type); +} + /* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index b29b8a20bb8b4..d503af2700247 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -190,7 +190,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, a->v.stripe_redundancy, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], + bch2_data_type_str(a->v.data_type), a->v.dirty_sectors, a->v.stripe, s.k->p.offset)) { ret = -EIO; @@ -200,7 +200,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], + bch2_data_type_str(a->v.data_type), a->v.dirty_sectors, s.k->p.offset)) { ret = -EIO; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index b0f4dd491e120..04a1e79a5ed39 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -683,10 +683,7 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); for (i = 0; i < nr_types; i++) { - if (i < BCH_DATA_NR) - prt_printf(out, " %s", bch2_data_types[i]); - else - prt_printf(out, " (unknown data type %u)", i); + bch2_prt_data_type(out, i); prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", le64_to_cpu(u->d[i].buckets), le64_to_cpu(u->d[i].sectors), diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7a33319dcd168..a9e0920b34f32 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -1083,9 +1083,9 @@ int bch2_data_job(struct bch_fs *c, void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) { - prt_printf(out, "%s: data type=%s pos=", - stats->name, - bch2_data_types[stats->data_type]); + prt_printf(out, "%s: data type==", stats->name); + bch2_prt_data_type(out, stats->data_type); + prt_str(out, " pos="); bch2_bbpos_to_text(out, stats->pos); prt_newline(out); printbuf_indent_add(out, 2); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 8e6f230eac381..6aaf78de88457 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -72,7 +72,7 @@ const char * const bch2_str_hash_opts[] = { NULL }; -const char * const bch2_data_types[] = { +const char * const __bch2_data_types[] = { BCH_DATA_TYPES() NULL }; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 93a24fef42148..67e98e00e9372 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -22,7 +22,7 @@ extern const char * const bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; -extern const char * const bch2_data_types[]; +extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; extern const char * const bch2_jset_entry_types[]; extern const char * const bch2_fs_usage_types[]; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 92ba56ef1fc89..1c3900da4c770 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -39,15 +39,10 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) static void bch2_replicas_entry_v0_to_text(struct printbuf *out, struct bch_replicas_entry_v0 *e) { - unsigned i; - - if (e->data_type < BCH_DATA_NR) - prt_printf(out, "%s", bch2_data_types[e->data_type]); - else - prt_printf(out, "(invalid data type %u)", e->data_type); + bch2_prt_data_type(out, e->data_type); prt_printf(out, ": %u [", e->nr_devs); - for (i = 0; i < e->nr_devs; i++) + for (unsigned i = 0; i < e->nr_devs; i++) prt_printf(out, i ? " %u" : "%u", e->devs[i]); prt_printf(out, "]"); } @@ -55,15 +50,10 @@ static void bch2_replicas_entry_v0_to_text(struct printbuf *out, void bch2_replicas_entry_to_text(struct printbuf *out, struct bch_replicas_entry_v1 *e) { - unsigned i; - - if (e->data_type < BCH_DATA_NR) - prt_printf(out, "%s", bch2_data_types[e->data_type]); - else - prt_printf(out, "(invalid data type %u)", e->data_type); + bch2_prt_data_type(out, e->data_type); prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); - for (i = 0; i < e->nr_devs; i++) + for (unsigned i = 0; i < e->nr_devs; i++) prt_printf(out, i ? " %u" : "%u", e->devs[i]); prt_printf(out, "]"); } diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index a44a238bf8b55..a45354d2acde9 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -251,7 +251,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Data allowed:"); prt_tab(out); if (BCH_MEMBER_DATA_ALLOWED(&m)) - prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); + prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); else prt_printf(out, "(none)"); prt_newline(out); @@ -259,7 +259,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Has data:"); prt_tab(out); if (data_have) - prt_bitflags(out, bch2_data_types, data_have); + prt_bitflags(out, __bch2_data_types, data_have); else prt_printf(out, "(none)"); prt_newline(out); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9dbc35940197f..a3ec21f229ed6 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1625,7 +1625,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (data) { struct printbuf data_has = PRINTBUF; - prt_bitflags(&data_has, bch2_data_types, data); + prt_bitflags(&data_has, __bch2_data_types, data); bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); printbuf_exit(&data_has); ret = -EBUSY; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 8ed52319ff68d..434961e400e24 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -883,7 +883,7 @@ static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca) for (i = 1; i < BCH_DATA_NR; i++) prt_printf(out, "%-12s:%12llu\n", - bch2_data_types[i], + bch2_data_type_str(i), percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); } } @@ -908,7 +908,7 @@ SHOW(bch2_dev) } if (attr == &sysfs_has_data) { - prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca)); + prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca)); prt_char(out, '\n'); } -- GitLab From 4f564f4f9fdd4d120ee04678b0c22e40cc8b6b47 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Jan 2024 21:01:47 -0500 Subject: [PATCH 1230/1290] bcachefs: bch2_prt_compression_type() bounds checking helper, since compression types are extensible Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.h | 8 ++++++++ fs/bcachefs/extents.c | 6 +++--- fs/bcachefs/opts.c | 2 +- fs/bcachefs/opts.h | 2 +- fs/bcachefs/sysfs.c | 3 ++- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 607fd5e232c90..58c2eb45570ff 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,6 +47,14 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } +static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type) +{ + if (type < BCH_COMPRESSION_TYPE_NR) + prt_str(out, __bch2_compression_types[type]); + else + prt_printf(out, "(invalid compression type %u)", type); +} + int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 82ec056f4cdbb..edb1e32d77831 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1018,12 +1018,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, - bch2_csum_types[crc.csum_type], - bch2_compression_types[crc.compression_type]); + bch2_csum_types[crc.csum_type]); + bch2_prt_compression_type(out, crc.compression_type); break; } case BCH_EXTENT_ENTRY_stripe_ptr: { diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 6aaf78de88457..b1ed0b9a20d35 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -52,7 +52,7 @@ const char * const bch2_csum_opts[] = { NULL }; -const char * const bch2_compression_types[] = { +const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 67e98e00e9372..7414c564b5d83 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -18,7 +18,7 @@ extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; -extern const char * const bch2_compression_types[]; +extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 434961e400e24..553190d719dfc 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -21,6 +21,7 @@ #include "btree_gc.h" #include "buckets.h" #include "clock.h" +#include "compress.h" #include "disk_groups.h" #include "ec.h" #include "inode.h" @@ -330,7 +331,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c prt_newline(out); for (unsigned i = 0; i < ARRAY_SIZE(s); i++) { - prt_str(out, bch2_compression_types[i]); + bch2_prt_compression_type(out, i); prt_tab(out); prt_human_readable_u64(out, s[i].sectors_compressed << 9); -- GitLab From 8e7834a8831678d0825895d7f5a02ad0b29bbcde Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Nov 2023 00:03:45 -0500 Subject: [PATCH 1231/1290] bcachefs: bch_fs_usage_base Split out base filesystem usage into its own type; prep work for breaking up bch2_trans_fs_usage_apply(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 12 ++++---- fs/bcachefs/btree_types.h | 3 ++ fs/bcachefs/buckets.c | 56 ++++++++++++++++++------------------- fs/bcachefs/buckets_types.h | 15 ++++------ fs/bcachefs/inode.c | 2 +- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/sb-clean.c | 2 +- 7 files changed, 45 insertions(+), 47 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 523e9b1069cd2..1102995643b13 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1253,19 +1253,19 @@ static int bch2_gc_done(struct bch_fs *c, bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); copy_fs_field(fs_usage_hidden_wrong, - hidden, "hidden"); + b.hidden, "hidden"); copy_fs_field(fs_usage_btree_wrong, - btree, "btree"); + b.btree, "btree"); if (!metadata_only) { copy_fs_field(fs_usage_data_wrong, - data, "data"); + b.data, "data"); copy_fs_field(fs_usage_cached_wrong, - cached, "cached"); + b.cached, "cached"); copy_fs_field(fs_usage_reserved_wrong, - reserved, "reserved"); + b.reserved, "reserved"); copy_fs_field(fs_usage_nr_inodes_wrong, - nr_inodes,"nr_inodes"); + b.nr_inodes,"nr_inodes"); for (i = 0; i < BCH_REPLICAS_MAX; i++) copy_fs_field(fs_usage_persistent_reserved_wrong, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index e46867536fa6a..e58e9a7f7b627 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -430,6 +430,9 @@ struct btree_trans { struct journal_res journal_res; u64 *journal_seq; struct disk_reservation *disk_res; + + struct bch_fs_usage_base fs_usage_delta; + unsigned journal_u64s; unsigned extra_disk_res; /* XXX kill */ struct replicas_delta_list *fs_usage_deltas; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5dc19363bb9fe..f8b9be9a84579 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -25,7 +25,7 @@ #include -static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, +static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, enum bch_data_type data_type, s64 sectors) { @@ -54,20 +54,20 @@ void bch2_fs_usage_initialize(struct bch_fs *c) bch2_fs_usage_acc_to_base(c, i); for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) - usage->reserved += usage->persistent_reserved[i]; + usage->b.reserved += usage->persistent_reserved[i]; for (unsigned i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); - fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); + fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]); } for_each_member_device(c, ca) { struct bch_dev_usage dev = bch2_dev_usage_read(ca); - usage->hidden += (dev.d[BCH_DATA_sb].buckets + - dev.d[BCH_DATA_journal].buckets) * + usage->b.hidden += (dev.d[BCH_DATA_sb].buckets + + dev.d[BCH_DATA_journal].buckets) * ca->mi.bucket_size; } @@ -188,15 +188,15 @@ void bch2_fs_usage_to_text(struct printbuf *out, prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); prt_printf(out, "hidden:\t\t\t\t%llu\n", - fs_usage->u.hidden); + fs_usage->u.b.hidden); prt_printf(out, "data:\t\t\t\t%llu\n", - fs_usage->u.data); + fs_usage->u.b.data); prt_printf(out, "cached:\t\t\t\t%llu\n", - fs_usage->u.cached); + fs_usage->u.b.cached); prt_printf(out, "reserved:\t\t\t%llu\n", - fs_usage->u.reserved); + fs_usage->u.b.reserved); prt_printf(out, "nr_inodes:\t\t\t%llu\n", - fs_usage->u.nr_inodes); + fs_usage->u.b.nr_inodes); prt_printf(out, "online reserved:\t\t%llu\n", fs_usage->online_reserved); @@ -225,10 +225,10 @@ static u64 reserve_factor(u64 r) u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) { - return min(fs_usage->u.hidden + - fs_usage->u.btree + - fs_usage->u.data + - reserve_factor(fs_usage->u.reserved + + return min(fs_usage->u.b.hidden + + fs_usage->u.b.btree + + fs_usage->u.b.data + + reserve_factor(fs_usage->u.b.reserved + fs_usage->online_reserved), c->capacity); } @@ -240,17 +240,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c) u64 data, reserved; ret.capacity = c->capacity - - bch2_fs_usage_read_one(c, &c->usage_base->hidden); + bch2_fs_usage_read_one(c, &c->usage_base->b.hidden); - data = bch2_fs_usage_read_one(c, &c->usage_base->data) + - bch2_fs_usage_read_one(c, &c->usage_base->btree); - reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + + data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) + + bch2_fs_usage_read_one(c, &c->usage_base->b.btree); + reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) + percpu_u64_get(c->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; - ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); + ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes); return ret; } @@ -308,9 +308,9 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, fs_usage = fs_usage_ptr(c, journal_seq, gc); if (data_type_is_hidden(old->data_type)) - fs_usage->hidden -= ca->mi.bucket_size; + fs_usage->b.hidden -= ca->mi.bucket_size; if (data_type_is_hidden(new->data_type)) - fs_usage->hidden += ca->mi.bucket_size; + fs_usage->b.hidden += ca->mi.bucket_size; u = dev_usage_ptr(ca, journal_seq, gc); @@ -359,7 +359,7 @@ static inline int __update_replicas(struct bch_fs *c, if (idx < 0) return -1; - fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); fs_usage->replicas[idx] += sectors; return 0; } @@ -394,7 +394,7 @@ int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k, preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); - fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); fs_usage->replicas[idx] += sectors; preempt_enable(); err: @@ -677,11 +677,11 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans, BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); } - dst->nr_inodes -= deltas->nr_inodes; + dst->b.nr_inodes -= deltas->nr_inodes; for (i = 0; i < BCH_REPLICAS_MAX; i++) { added -= deltas->persistent_reserved[i]; - dst->reserved -= deltas->persistent_reserved[i]; + dst->b.reserved -= deltas->persistent_reserved[i]; dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; } @@ -723,11 +723,11 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, goto need_mark; } - dst->nr_inodes += deltas->nr_inodes; + dst->b.nr_inodes += deltas->nr_inodes; for (i = 0; i < BCH_REPLICAS_MAX; i++) { added += deltas->persistent_reserved[i]; - dst->reserved += deltas->persistent_reserved[i]; + dst->b.reserved += deltas->persistent_reserved[i]; dst->persistent_reserved[i] += deltas->persistent_reserved[i]; } @@ -1084,7 +1084,7 @@ static int __trigger_reservation(struct btree_trans *trans, struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc); replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved)); - fs_usage->reserved += sectors; + fs_usage->b.reserved += sectors; fs_usage->persistent_reserved[replicas - 1] += sectors; preempt_enable(); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 783f71017204c..6a31740222a71 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -45,23 +45,18 @@ struct bch_dev_usage { } d[BCH_DATA_NR]; }; -struct bch_fs_usage { - /* all fields are in units of 512 byte sectors: */ +struct bch_fs_usage_base { u64 hidden; u64 btree; u64 data; u64 cached; u64 reserved; u64 nr_inodes; +}; - /* XXX: add stats for compression ratio */ -#if 0 - u64 uncompressed; - u64 compressed; -#endif - - /* broken out: */ - +struct bch_fs_usage { + /* all fields are in units of 512 byte sectors: */ + struct bch_fs_usage_base b; u64 persistent_reserved[BCH_REPLICAS_MAX]; u64 replicas[]; }; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 51a06324b21d3..18a8d141b4437 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -597,7 +597,7 @@ int bch2_trigger_inode(struct btree_trans *trans, struct bch_fs *c = trans->c; percpu_down_read(&c->mark_lock); - this_cpu_add(c->usage_gc->nr_inodes, nr); + this_cpu_add(c->usage_gc->b.nr_inodes, nr); percpu_up_read(&c->mark_lock); } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 725214605a050..9127d0e3ca2f6 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -280,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(u->v); break; case BCH_FS_USAGE_inodes: - c->usage_base->nr_inodes = le64_to_cpu(u->v); + c->usage_base->b.nr_inodes = le64_to_cpu(u->v); break; case BCH_FS_USAGE_key_version: atomic64_set(&c->key_version, diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 9632f36f5f318..b6bf0ebe7e840 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -207,7 +207,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = BCH_FS_USAGE_inodes; - u->v = cpu_to_le64(c->usage_base->nr_inodes); + u->v = cpu_to_le64(c->usage_base->b.nr_inodes); } { -- GitLab From 5b14ce35af901853e91e186f34e71f31b08b4e0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Nov 2023 15:08:36 -0500 Subject: [PATCH 1232/1290] bcachefs: bch2_trans_account_disk_usage_change() The disk space accounting rewrite is splitting out accounting for each replicas set - those are moving to btree keys, instead of percpu counters. This breaks bch2_trans_fs_usage_apply() up, splitting out the part we will still need. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 5 +++ fs/bcachefs/buckets.c | 70 +++++++++++++++++++------------- fs/bcachefs/buckets.h | 2 + 3 files changed, 48 insertions(+), 29 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index e3a82c33912b9..ab00d202361eb 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -675,6 +675,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) return -BCH_ERR_btree_insert_need_mark_replicas; + /* XXX: we only want to run this if deltas are nonzero */ + bch2_trans_account_disk_usage_change(trans); + h = trans->hooks; while (h) { ret = h->fn(trans, h); @@ -989,6 +992,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) !trans->journal_entries_u64s) goto out_reset; + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); + ret = bch2_trans_commit_run_triggers(trans); if (ret) goto out_reset; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index f8b9be9a84579..54f7826ac4987 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -694,48 +694,25 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans, percpu_up_read(&c->mark_lock); } -int bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct replicas_delta_list *deltas) +void bch2_trans_account_disk_usage_change(struct btree_trans *trans) { struct bch_fs *c = trans->c; + u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; static int warned_disk_usage = 0; bool warn = false; - u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; - struct replicas_delta *d, *d2; - struct replicas_delta *top = (void *) deltas->d + deltas->used; - struct bch_fs_usage *dst; - s64 added = 0, should_not_have_added; - unsigned i; percpu_down_read(&c->mark_lock); preempt_disable(); - dst = fs_usage_ptr(c, trans->journal_res.seq, false); - - for (d = deltas->d; d != top; d = replicas_delta_next(d)) { - switch (d->r.data_type) { - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - added += d->delta; - } - - if (__update_replicas(c, dst, &d->r, d->delta)) - goto need_mark; - } + struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b; + struct bch_fs_usage_base *src = &trans->fs_usage_delta; - dst->b.nr_inodes += deltas->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - added += deltas->persistent_reserved[i]; - dst->b.reserved += deltas->persistent_reserved[i]; - dst->persistent_reserved[i] += deltas->persistent_reserved[i]; - } + s64 added = src->btree + src->data + src->reserved; /* * Not allowed to reduce sectors_available except by getting a * reservation: */ - should_not_have_added = added - (s64) disk_res_sectors; + s64 should_not_have_added = added - (s64) disk_res_sectors; if (unlikely(should_not_have_added > 0)) { u64 old, new, v = atomic64_read(&c->sectors_available); @@ -754,6 +731,13 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, this_cpu_sub(*c->online_reserved, added); } + dst->hidden += src->hidden; + dst->btree += src->btree; + dst->data += src->data; + dst->cached += src->cached; + dst->reserved += src->reserved; + dst->nr_inodes += src->nr_inodes; + preempt_enable(); percpu_up_read(&c->mark_lock); @@ -761,6 +745,34 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, bch2_trans_inconsistent(trans, "disk usage increased %lli more than %llu sectors reserved)", should_not_have_added, disk_res_sectors); +} + +int bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + struct replicas_delta *d, *d2; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + unsigned i; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + for (d = deltas->d; d != top; d = replicas_delta_next(d)) + if (__update_replicas(c, dst, &d->r, d->delta)) + goto need_mark; + + dst->b.nr_inodes += deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + dst->b.reserved += deltas->persistent_reserved[i]; + dst->persistent_reserved[i] += deltas->persistent_reserved[i]; + } + + preempt_enable(); + percpu_up_read(&c->mark_lock); return 0; need_mark: /* revert changes: */ diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 2b1e907f2acad..6387e039f7897 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -356,6 +356,8 @@ int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, ret; \ }) +void bch2_trans_account_disk_usage_change(struct btree_trans *); + void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); -- GitLab From 57f2d2097603fea102330e8cfe6be4a8db24809e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Jan 2024 23:47:04 -0500 Subject: [PATCH 1233/1290] bcachefs: Reduce would_deadlock restarts We don't have to take locks in any particular ordering - we'll make forward progress just fine - but if we try to stick to an ordering, it can help to avoid excessive would_deadlock transaction restarts. This tweaks the reflink path to take extents btree locks in the right order. Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 6070109174210..98255aa64e226 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -486,6 +486,13 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + if (dst_inum.inum < src_inum.inum) { + /* Avoid some lock cycle transaction restarts */ + ret = bch2_btree_iter_traverse(&dst_iter); + if (ret) + continue; + } + dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); bch2_btree_iter_set_pos(&src_iter, src_want); -- GitLab From 0124f42da70c513dc371b73688663c54e5a9666f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 14:12:43 -0500 Subject: [PATCH 1234/1290] bcachefs: Don't pass memcmp() as a pointer Some (buggy!) compilers have issues with this. Fixes: https://github.com/koverstreet/bcachefs/issues/625 Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 1c3900da4c770..cc2672c120312 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -9,6 +9,12 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); +/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ +static int bch2_memcmp(const void *l, const void *r, size_t size) +{ + return memcmp(l, r, size); +} + /* Replicas tracking - in memory: */ static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) @@ -33,7 +39,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); + eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, @@ -821,7 +827,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, sort_cmp_size(cpu_r->entries, cpu_r->nr, cpu_r->entry_size, - memcmp, NULL); + bch2_memcmp, NULL); for (i = 0; i < cpu_r->nr; i++) { struct bch_replicas_entry_v1 *e = -- GitLab From 741c1d3ec1a4a91d0bf18f200e2f0f8bed1ee7e9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 14:15:03 -0500 Subject: [PATCH 1235/1290] bcachefs: Add .val_to_text() for KEY_TYPE_cookie Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_methods.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 761f5e33b1e69..5e52684764eb1 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -63,8 +63,17 @@ static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, return 0; } +static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k); + + prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie)); +} + #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ .key_invalid = key_type_cookie_invalid, \ + .val_to_text = key_type_cookie_to_text, \ .min_val_size = 8, \ }) -- GitLab From d92b83f592d810aded2e5f90db5f560cc8cf577b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 14:15:26 -0500 Subject: [PATCH 1236/1290] bcachefs: bch2_kthread_io_clock_wait() no longer sleeps until full amount Drop t he loop in bch2_kthread_io_clock_wait(): this allows the code that uses it to be woken up for other reasons, and fixes a bug where rebalance wouldn't wake up when a scan was requested. This raises the possibility of spurious wakeups, but callers should always be able to handle that reasonably well. Signed-off-by: Kent Overstreet --- fs/bcachefs/clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index f41889093a2c7..3636444511064 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -109,7 +109,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); - while (1) { + do { set_current_state(TASK_INTERRUPTIBLE); if (kthread && kthread_should_stop()) break; @@ -119,7 +119,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, schedule(); try_to_freeze(); - } + } while (0); __set_current_state(TASK_RUNNING); del_timer_sync(&wait.cpu_timer); -- GitLab From fa3185af43dce43a23df78c122bef860bcd4bf40 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 15:04:40 -0500 Subject: [PATCH 1237/1290] bcachefs: Re-add move_extent_write tracepoint It appears this was accidentally deleted at some point - also, do a bit of cleanup. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 9 +++++++++ fs/bcachefs/trace.h | 34 +++++++++++----------------------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a9e0920b34f32..7a66706e4dcef 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -111,6 +111,15 @@ static void move_write(struct moving_io *io) return; } + if (trace_move_extent_write_enabled()) { + struct bch_fs *c = io->write.op.c; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); + trace_move_extent_write(c, buf.buf); + printbuf_exit(&buf); + } + closure_get(&io->write.ctxt->cl); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); atomic_inc(&io->write.ctxt->write_ios); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index c94876b3bb06e..8292efc3289ba 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -827,40 +827,28 @@ TRACE_EVENT(bucket_evacuate, ); DEFINE_EVENT(fs_str, move_extent, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_read, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_write, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_finish, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -TRACE_EVENT(move_extent_fail, - TP_PROTO(struct bch_fs *c, const char *msg), - TP_ARGS(c, msg), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __string(msg, msg ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __assign_str(msg, msg); - ), - - TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) +DEFINE_EVENT(fs_str, move_extent_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(fs_str, move_extent_start_fail, -- GitLab From ef740a1e2939376ea4cc11cc8b923214dc1f4a41 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 15:06:43 -0500 Subject: [PATCH 1238/1290] bcachefs: Add missing bch2_moving_ctxt_flush_all() This fixes a bug with rebalance IOs getting stuck with reads completed, but writes never being issued. Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 95f46cb3b5bdf..a729682d653d6 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -371,6 +371,7 @@ static int do_rebalance(struct moving_context *ctxt) !kthread_should_stop() && !atomic64_read(&r->work_stats.sectors_seen) && !atomic64_read(&r->scan_stats.sectors_seen)) { + bch2_moving_ctxt_flush_all(ctxt); bch2_trans_unlock_long(trans); rebalance_wait(c); } -- GitLab From 189c176c5dd324531d4cb23f172b1761e65bb0ed Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 15:33:39 -0500 Subject: [PATCH 1239/1290] bcachefs: Improve move_extent tracepoint Also print out the data_opts, so that we can see what specifically is being done to an extent. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 2 +- fs/bcachefs/move.c | 40 ++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/rebalance.c | 3 +-- fs/bcachefs/util.c | 7 ++++++- fs/bcachefs/util.h | 3 ++- 5 files changed, 48 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index abdb05507d162..76e79a15ba08f 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -33,7 +33,7 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out, next_key_bits -= 64; } - bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits)); + bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits)); if (!next_key_bits) break; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7a66706e4dcef..2e083daedfb2b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -9,6 +9,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "compress.h" #include "disk_groups.h" #include "ec.h" #include "errcode.h" @@ -34,12 +35,46 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k) +static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + printbuf_tabstop_push(out, 20); + prt_str(out, "rewrite ptrs:"); + prt_tab(out); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); + prt_newline(out); + + prt_str(out, "kill ptrs: "); + prt_tab(out); + bch2_prt_u64_base2(out, data_opts->kill_ptrs); + prt_newline(out); + + prt_str(out, "target: "); + prt_tab(out); + bch2_target_to_text(out, c, data_opts->target); + prt_newline(out); + + prt_str(out, "compression: "); + prt_tab(out); + bch2_compression_opt_to_text(out, io_opts->background_compression ?: io_opts->compression); + prt_newline(out); + + prt_str(out, "extra replicas: "); + prt_tab(out); + prt_u64(out, data_opts->extra_replicas); +} + +static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { if (trace_move_extent_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); trace_move_extent(c, buf.buf); printbuf_exit(&buf); } @@ -250,9 +285,10 @@ int bch2_move_extent(struct moving_context *ctxt, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; + trace_move_extent2(c, k, &io_opts, &data_opts); + if (ctxt->stats) ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); - trace_move_extent2(c, k); bch2_data_update_opts_normalize(k, &data_opts); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index a729682d653d6..f24106dee21d6 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -177,8 +177,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, prt_str(&buf, "target="); bch2_target_to_text(&buf, c, r->target); prt_str(&buf, " compression="); - struct bch_compression_opt opt = __bch2_compression_decode(r->compression); - prt_str(&buf, bch2_compression_opts[opt.type]); + bch2_compression_opt_to_text(&buf, r->compression); prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, k); diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index f927c8a19e241..a135136adeee3 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -241,12 +241,17 @@ bool bch2_is_zero(const void *_p, size_t n) return true; } -void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) +void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits) { while (nr_bits) prt_char(out, '0' + ((v >> --nr_bits) & 1)); } +void bch2_prt_u64_base2(struct printbuf *out, u64 v) +{ + bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); +} + void bch2_print_string_as_lines(const char *prefix, const char *lines) { const char *p; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index c75fc31915d39..df67bf55fe2bc 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -342,7 +342,8 @@ bool bch2_is_zero(const void *, size_t); u64 bch2_read_flag_list(char *, const char * const[]); -void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); +void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); +void bch2_prt_u64_base2(struct printbuf *, u64); void bch2_print_string_as_lines(const char *prefix, const char *lines); -- GitLab From a6548c8b5eb541e77ffcf497e8761f34172ff828 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 17:56:22 -0500 Subject: [PATCH 1240/1290] bcachefs: Avoid flushing the journal in the discard path When issuing discards, we may need to flush the journal if there's too many buckets that can't be discarded until a journal flush. But the heuristic was bad; we should be comparing the number of buckets that need to flushes against the number of free buckets, not the number of buckets we saw. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 60 +++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 614da759226b5..10704f2d3af53 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1604,13 +1604,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret; } +struct discard_buckets_state { + u64 seen; + u64 open; + u64 need_journal_commit; + u64 discarded; + struct bch_dev *ca; + u64 need_journal_commit_this_dev; +}; + +static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) +{ + if (s->ca == ca) + return; + + if (s->ca && s->need_journal_commit_this_dev > + bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) + bch2_journal_flush_async(&c->journal, NULL); + + if (s->ca) + percpu_ref_put(&s->ca->ref); + if (ca) + percpu_ref_get(&ca->ref); + s->ca = ca; + s->need_journal_commit_this_dev = 0; +} + static int bch2_discard_one_bucket(struct btree_trans *trans, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, - u64 *seen, - u64 *open, - u64 *need_journal_commit, - u64 *discarded) + struct discard_buckets_state *s) { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; @@ -1622,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); + if (!percpu_ref_tryget(&ca->io_ref)) { bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); return 0; } + discard_buckets_next_dev(c, s, ca); + if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { - (*open)++; + s->open++; goto out; } if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, pos.inode, pos.offset)) { - (*need_journal_commit)++; + s->need_journal_commit++; + s->need_journal_commit_this_dev++; goto out; } @@ -1711,9 +1738,9 @@ write: goto out; count_event(c, bucket_discard); - (*discarded)++; + s->discarded++; out: - (*seen)++; + s->seen++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); @@ -1723,7 +1750,7 @@ out: static void bch2_do_discards_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, discard_work); - u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; + struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1735,19 +1762,14 @@ static void bch2_do_discards_work(struct work_struct *work) ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(trans, &iter, &discard_pos_done, - &seen, - &open, - &need_journal_commit, - &discarded))); - - if (need_journal_commit * 2 > seen) - bch2_journal_flush_async(&c->journal, NULL); + bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + discard_buckets_next_dev(c, &s, NULL); - trace_discard_buckets(c, seen, open, need_journal_commit, discarded, + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) -- GitLab From 4ae016607b907e69ed817ce14158adffb9b47978 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 17:57:44 -0500 Subject: [PATCH 1241/1290] bcachefs: Print size of superblock with space allocated Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 55926b81eede6..9564d2d9ccaef 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1320,7 +1320,9 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "Superblock size:"); prt_tab(out); - prt_printf(out, "%zu", vstruct_bytes(sb)); + prt_units_u64(out, vstruct_bytes(sb)); + prt_str(out, "/"); + prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits); prt_newline(out); prt_printf(out, "Clean:"); -- GitLab From e6a2566f7a009b644fd84a43a6c1e3a53bb0bf00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 17:59:51 -0500 Subject: [PATCH 1242/1290] bcachefs: Better journal tracepoints Factor out bch2_journal_bufs_to_text(), and use it in the journal_entry_full() tracepoint; when we can't get a journal reservation we need to know the outstanding journal entry sizes to know if the problem is due to excessive flushing. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 111 +++++++++++++++++++++++++++--------------- fs/bcachefs/trace.h | 28 +++-------- 2 files changed, 79 insertions(+), 60 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 8538ef34f62bc..d71d26e39521e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -27,6 +27,47 @@ static const char * const bch2_journal_errors[] = { NULL }; +static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) +{ + union journal_res_state s = READ_ONCE(j->reservations); + unsigned i = seq & JOURNAL_BUF_MASK; + struct journal_buf *buf = j->buf + i; + + prt_printf(out, "seq:"); + prt_tab(out); + prt_printf(out, "%llu", seq); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "refcount:"); + prt_tab(out); + prt_printf(out, "%u", journal_state_count(s, i)); + prt_newline(out); + + prt_printf(out, "size:"); + prt_tab(out); + prt_human_readable_u64(out, vstruct_bytes(buf->data)); + prt_newline(out); + + prt_printf(out, "expires"); + prt_tab(out); + prt_printf(out, "%li jiffies", buf->expires - jiffies); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) +{ + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 24); + + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) + bch2_journal_buf_to_text(out, j, seq); +} + static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; @@ -156,7 +197,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) * We don't close a journal_buf until the next journal_buf is finished writing, * and can be opened again - this also initializes the next journal_buf: */ -static void __journal_entry_close(struct journal *j, unsigned closed_val) +static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); @@ -185,7 +226,17 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - trace_journal_entry_close(c, vstruct_bytes(buf->data)); + if (trace_journal_entry_close_enabled() && trace) { + struct printbuf pbuf = PRINTBUF; + pbuf.atomic++; + + prt_str(&pbuf, "entry size: "); + prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data)); + prt_newline(&pbuf); + bch2_prt_task_backtrace(&pbuf, current, 1); + trace_journal_entry_close(c, pbuf.buf); + printbuf_exit(&pbuf); + } sectors = vstruct_blocks_plus(buf->data, c->block_bits, buf->u64s_reserved) << c->block_bits; @@ -225,7 +276,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) void bch2_journal_halt(struct journal *j) { spin_lock(&j->lock); - __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); if (!j->err_seq) j->err_seq = journal_cur_seq(j); journal_wake(j); @@ -239,7 +290,7 @@ static bool journal_entry_want_write(struct journal *j) /* Don't close it yet if we already have a write in flight: */ if (ret) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); else if (nr_unwritten_journal_entries(j)) { struct journal_buf *buf = journal_cur_buf(j); @@ -406,7 +457,7 @@ static void journal_write_work(struct work_struct *work) if (delta > 0) mod_delayed_work(c->io_complete_wq, &j->write_work, delta); else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); unlock: spin_unlock(&j->lock); } @@ -463,13 +514,21 @@ retry: buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); ret = journal_entry_open(j); if (ret == JOURNAL_ERR_max_in_flight) { track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], &j->max_in_flight_start, true); - trace_and_count(c, journal_entry_full, c); + if (trace_journal_entry_full_enabled()) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + + bch2_journal_bufs_to_text(&buf, j); + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + } + count_event(c, journal_entry_full); } unlock: can_discard = j->can_discard; @@ -549,7 +608,7 @@ void bch2_journal_entry_res_resize(struct journal *j, /* * Not enough room in current journal entry, have to flush it: */ - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); } else { journal_cur_buf(j)->u64s_reserved += d; } @@ -606,7 +665,7 @@ recheck_need_open: struct journal_res res = { 0 }; if (journal_entry_is_open(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); spin_unlock(&j->lock); @@ -786,7 +845,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou if (buf->need_flush_to_write_buffer) { if (seq == journal_cur_seq(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); union journal_res_state s; s.v = atomic64_read_acquire(&j->reservations.counter); @@ -1339,35 +1398,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } prt_newline(out); - - for (u64 seq = journal_cur_seq(j); - seq >= journal_last_unwritten_seq(j); - --seq) { - unsigned i = seq & JOURNAL_BUF_MASK; - - prt_printf(out, "unwritten entry:"); - prt_tab(out); - prt_printf(out, "%llu", seq); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "refcount:"); - prt_tab(out); - prt_printf(out, "%u", journal_state_count(s, i)); - prt_newline(out); - - prt_printf(out, "sectors:"); - prt_tab(out); - prt_printf(out, "%u", j->buf[i].sectors); - prt_newline(out); - - prt_printf(out, "expires"); - prt_tab(out); - prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies); - prt_newline(out); - - printbuf_indent_sub(out, 2); - } + prt_printf(out, "unwritten entries:"); + prt_newline(out); + bch2_journal_bufs_to_text(out, j); prt_printf(out, "replay done:\t\t%i\n", diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 8292efc3289ba..ea307ed49424e 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -46,7 +46,7 @@ DECLARE_EVENT_CLASS(fs_str, __assign_str(str, str); ), - TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) + TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) ); DECLARE_EVENT_CLASS(trans_str, @@ -273,28 +273,14 @@ DEFINE_EVENT(bch_fs, journal_full, TP_ARGS(c) ); -DEFINE_EVENT(bch_fs, journal_entry_full, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(fs_str, journal_entry_full, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -TRACE_EVENT(journal_entry_close, - TP_PROTO(struct bch_fs *c, unsigned bytes), - TP_ARGS(c, bytes), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u32, bytes ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->bytes = bytes; - ), - - TP_printk("%d,%d entry bytes %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->bytes) +DEFINE_EVENT(fs_str, journal_entry_close, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(bio, journal_write, -- GitLab From ba96d36ca526f99b163927115abfec36ef5565e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 18:08:32 -0500 Subject: [PATCH 1243/1290] bcachefs: bkey_and_val_eq() Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index e358a2ffffdea..56e5a0e213f9e 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -400,6 +400,13 @@ int bch2_check_btree_backpointers(struct bch_fs *c) return ret; } +static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) +{ + return bpos_eq(l.k->p, r.k->p) && + bkey_bytes(l.k) == bkey_bytes(r.k) && + !memcmp(l.v, r.v, bkey_val_bytes(l.k)); +} + static int check_bp_exists(struct btree_trans *trans, struct bpos bucket, struct bch_backpointer bp, @@ -433,9 +440,7 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - if (!bpos_eq(orig_k.k->p, last_flushed->k->k.p) || - bkey_bytes(orig_k.k) != bkey_bytes(&last_flushed->k->k) || - memcmp(orig_k.v, &last_flushed->k->v, bkey_val_bytes(orig_k.k))) { + if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(last_flushed->k))) { bch2_bkey_buf_reassemble(&tmp, c, orig_k); if (bp.level) { -- GitLab From 1a5039041b376f545dfc11d89af77cc720217b44 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 18:19:52 -0500 Subject: [PATCH 1244/1290] bcachefs: extents_to_bp_state Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 89 ++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 48 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 56e5a0e213f9e..bf828f1f28d2b 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -407,13 +407,17 @@ static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) !memcmp(l.v, r.v, bkey_val_bytes(l.k)); } +struct extents_to_bp_state { + struct bpos bucket_start; + struct bpos bucket_end; + struct bkey_buf last_flushed; +}; + static int check_bp_exists(struct btree_trans *trans, + struct extents_to_bp_state *s, struct bpos bucket, struct bch_backpointer bp, - struct bkey_s_c orig_k, - struct bpos bucket_start, - struct bpos bucket_end, - struct bkey_buf *last_flushed) + struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; struct btree_iter bp_iter = { NULL }; @@ -424,8 +428,8 @@ static int check_bp_exists(struct btree_trans *trans, bch2_bkey_buf_init(&tmp); - if (bpos_lt(bucket, bucket_start) || - bpos_gt(bucket, bucket_end)) + if (bpos_lt(bucket, s->bucket_start) || + bpos_gt(bucket, s->bucket_end)) return 0; if (!bch2_dev_bucket_exists(c, bucket)) @@ -440,9 +444,9 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(last_flushed->k))) { - bch2_bkey_buf_reassemble(&tmp, c, orig_k); + bch2_bkey_buf_reassemble(&tmp, c, orig_k); + if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) { if (bp.level) { bch2_trans_unlock(trans); bch2_btree_interior_updates_flush(c); @@ -452,7 +456,7 @@ static int check_bp_exists(struct btree_trans *trans, if (ret) goto err; - bch2_bkey_buf_copy(last_flushed, c, tmp.k); + bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k); ret = -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } @@ -480,10 +484,8 @@ missing: } static int check_extent_to_backpointers(struct btree_trans *trans, + struct extents_to_bp_state *s, enum btree_id btree, unsigned level, - struct bpos bucket_start, - struct bpos bucket_end, - struct bkey_buf *last_flushed, struct bkey_s_c k) { struct bch_fs *c = trans->c; @@ -503,9 +505,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, bch2_extent_ptr_to_bp(c, btree, level, k, p, &bucket_pos, &bp); - ret = check_bp_exists(trans, bucket_pos, bp, k, - bucket_start, bucket_end, - last_flushed); + ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) return ret; } @@ -514,10 +514,8 @@ static int check_extent_to_backpointers(struct btree_trans *trans, } static int check_btree_root_to_backpointers(struct btree_trans *trans, + struct extents_to_bp_state *s, enum btree_id btree_id, - struct bpos bucket_start, - struct bpos bucket_end, - struct bkey_buf *last_flushed, int *level) { struct bch_fs *c = trans->c; @@ -541,9 +539,7 @@ retry: *level = b->c.level; k = bkey_i_to_s_c(&b->key); - ret = check_extent_to_backpointers(trans, btree_id, b->c.level + 1, - bucket_start, bucket_end, - last_flushed, k); + ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -615,43 +611,35 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, } static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - struct bpos bucket_start, - struct bpos bucket_end) + struct extents_to_bp_state *s) { struct bch_fs *c = trans->c; - struct btree_iter iter; - enum btree_id btree_id; - struct bkey_s_c k; - struct bkey_buf last_flushed; int ret = 0; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); - - for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { + for (enum btree_id btree_id = 0; + btree_id < btree_id_nr_alive(c); + btree_id++) { int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1; ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_btree_root_to_backpointers(trans, btree_id, - bucket_start, bucket_end, - &last_flushed, &level)); + check_btree_root_to_backpointers(trans, s, btree_id, &level)); if (ret) return ret; while (level >= depth) { + struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, BTREE_ITER_PREFETCH); while (1) { bch2_trans_begin(trans); - k = bch2_btree_iter_peek(&iter); + + struct bkey_s_c k = bch2_btree_iter_peek(&iter); if (!k.k) break; ret = bkey_err(k) ?: - check_extent_to_backpointers(trans, btree_id, level, - bucket_start, bucket_end, - &last_flushed, k) ?: + check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { @@ -673,7 +661,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, } } - bch2_bkey_buf_exit(&last_flushed, c); return 0; } @@ -736,37 +723,43 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct bpos start = POS_MIN, end; + struct extents_to_bp_state s = { .bucket_start = POS_MIN }; int ret; + bch2_bkey_buf_init(&s.last_flushed); + bkey_init(&s.last_flushed.k->k); + while (1) { - ret = bch2_get_alloc_in_memory_pos(trans, start, &end); + ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); if (ret) break; - if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX)) + if ( bpos_eq(s.bucket_start, POS_MIN) && + !bpos_eq(s.bucket_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); - if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) { + if (!bpos_eq(s.bucket_start, POS_MIN) || + !bpos_eq(s.bucket_end, SPOS_MAX)) { struct printbuf buf = PRINTBUF; prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, start); + bch2_bpos_to_text(&buf, s.bucket_start); prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, end); + bch2_bpos_to_text(&buf, s.bucket_end); bch_verbose(c, "%s", buf.buf); printbuf_exit(&buf); } - ret = bch2_check_extents_to_backpointers_pass(trans, start, end); - if (ret || bpos_eq(end, SPOS_MAX)) + ret = bch2_check_extents_to_backpointers_pass(trans, &s); + if (ret || bpos_eq(s.bucket_end, SPOS_MAX)) break; - start = bpos_successor(end); + s.bucket_start = bpos_successor(s.bucket_end); } bch2_trans_put(trans); + bch2_bkey_buf_exit(&s.last_flushed, c); bch_err_fn(c, ret); return ret; -- GitLab From 46bf2e9cc745996ca56e56ed816e60d07811bd9a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 20:37:23 -0500 Subject: [PATCH 1245/1290] bcachefs: Fix excess transaction restarts in __bchfs_fallocate() drop_locks_do() should not be used in a fastpath without first trying the do in nonblocking mode - the unlock and relock will cause excessive transaction restarts and potentially livelocking with other threads that are contending for the same locks. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 5 +++++ fs/bcachefs/fs-io-pagecache.c | 37 +++++++++++++++++++++++------------ fs/bcachefs/fs-io-pagecache.h | 2 +- fs/bcachefs/fs-io.c | 7 +++++-- 4 files changed, 35 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index da2b74fa63fce..24772538e4cc7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -819,6 +819,11 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) +/* + * This should not be used in a fastpath, without first trying _do in + * nonblocking mode - it will cause excessive transaction restarts and + * potentially livelocking: + */ #define drop_locks_do(_trans, _do) \ ({ \ bch2_trans_unlock(_trans); \ diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index ff664fd0d8ef8..d359aa9b33b82 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -309,39 +309,49 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, } } -void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, - u64 start, u64 end) +int bch2_mark_pagecache_reserved(struct bch_inode_info *inode, + u64 *start, u64 end, + bool nonblocking) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t index = *start >> PAGE_SECTORS_SHIFT; pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; struct folio_batch fbatch; s64 i_sectors_delta = 0; - unsigned i, j; + int ret = 0; - if (end <= start) - return; + if (end <= *start) + return 0; folio_batch_init(&fbatch); while (filemap_get_folios(inode->v.i_mapping, &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { + for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; + + if (!nonblocking) + folio_lock(folio); + else if (!folio_trylock(folio)) { + folio_batch_release(&fbatch); + ret = -EAGAIN; + break; + } + u64 folio_start = folio_sector(folio); u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; BUG_ON(end <= folio_start); - folio_lock(folio); - s = bch2_folio(folio); + *start = min(end, folio_end); + struct bch_folio *s = bch2_folio(folio); if (s) { + unsigned folio_offset = max(*start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) { + for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) { i_sectors_delta -= s->s[j].state == SECTOR_dirty; bch2_folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); @@ -356,6 +366,7 @@ void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, } bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + return ret; } static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index 27f712ae37a68..8cbaba6565b44 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -143,7 +143,7 @@ int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); -void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); +int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool); int bch2_get_folio_disk_reservation(struct bch_fs *, struct bch_inode_info *, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 98bd5babab193..dc52918d06ef3 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -675,8 +675,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); - drop_locks_do(trans, - (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); + if (bch2_mark_pagecache_reserved(inode, &hole_start, + iter.pos.offset, true)) + drop_locks_do(trans, + bch2_mark_pagecache_reserved(inode, &hole_start, + iter.pos.offset, false)); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -- GitLab From b97de453651f06071afbf52a5614bd55b8cc4740 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jan 2024 20:40:06 -0500 Subject: [PATCH 1246/1290] bcachefs: Improve trace_trans_restart_relock Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- fs/bcachefs/btree_locking.c | 40 ++++++++++++++++++++++++++++++------- fs/bcachefs/btree_locking.h | 9 +-------- fs/bcachefs/btree_types.h | 5 +++++ fs/bcachefs/trace.h | 12 ++++------- 5 files changed, 44 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index fa298289e0165..5467a8635be11 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1337,7 +1337,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in if (path->should_be_locked && !trans->restarted && - (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) + (!dup || !bch2_btree_path_relock_norestart(trans, dup))) return; if (dup) { diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 2d1c95c42f240..bed75c93c0690 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -631,8 +631,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, } __flatten -bool bch2_btree_path_relock_norestart(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) +bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) { struct get_locks_fail f; @@ -642,7 +641,7 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, int __bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { - if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { + if (!bch2_btree_path_relock_norestart(trans, path)) { trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); } @@ -759,12 +758,39 @@ int bch2_trans_relock(struct btree_trans *trans) if (unlikely(trans->restarted)) return -((int) trans->restarted); - trans_for_each_path(trans, path, i) + trans_for_each_path(trans, path, i) { + struct get_locks_fail f; + if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { - trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); + !btree_path_get_locks(trans, path, false, &f)) { + if (trace_trans_restart_relock_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, " l=%u seq=%u node seq=", + f.l, path->l[f.l].lock_seq); + if (IS_ERR_OR_NULL(f.b)) { + prt_str(&buf, bch2_err_str(PTR_ERR(f.b))); + } else { + prt_printf(&buf, "%u", f.b->c.lock.seq); + + struct six_lock_count c = + bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l); + prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + + c = six_lock_counts(&f.b->c.lock); + prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + } + + trace_trans_restart_relock(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); + } + + count_event(trans->c, trans_restart_relock); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } + } + return 0; } @@ -778,7 +804,7 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) trans_for_each_path(trans, path, i) if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + !bch2_btree_path_relock_norestart(trans, path)) { return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } return 0; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index cc5500a957a1b..4bd72c855da1a 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -312,8 +312,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *, /* relock: */ -bool bch2_btree_path_relock_norestart(struct btree_trans *, - struct btree_path *, unsigned long); +bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *); int __bch2_btree_path_relock(struct btree_trans *, struct btree_path *, unsigned long); @@ -353,12 +352,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, /* upgrade */ - -struct get_locks_fail { - unsigned l; - struct btree *b; -}; - bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, struct btree_path *, unsigned, struct get_locks_fail *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index e58e9a7f7b627..4a5a64499eb76 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -741,4 +741,9 @@ enum btree_node_sibling { btree_next_sib, }; +struct get_locks_fail { + unsigned l; + struct btree *b; +}; + #endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index ea307ed49424e..6ac1dfeaa8f29 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -528,7 +528,7 @@ TRACE_EVENT(btree_path_relock_fail, __entry->level = path->level; TRACE_BPOS_assign(pos, path->pos); - c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); __entry->self_read_count = c.n[SIX_LOCK_read]; __entry->self_intent_count = c.n[SIX_LOCK_intent]; @@ -1120,8 +1120,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); -struct get_locks_fail; - TRACE_EVENT(trans_restart_upgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1169,11 +1167,9 @@ TRACE_EVENT(trans_restart_upgrade, __entry->node_seq) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) +DEFINE_EVENT(trans_str, trans_restart_relock, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str), + TP_ARGS(trans, caller_ip, str) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, -- GitLab From aead3428e8b7502942356a17f0882d28eb3ff0c3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 16 Jan 2024 11:07:23 +0000 Subject: [PATCH 1247/1290] bcachefs: remove redundant variable tmp The variable tmp is being assigned a value but it isn't being read afterwards. The assignment is redundant and so tmp can be removed. Cleans up clang scan build warning: warning: Although the value stored to 'ret' is used in the enclosing expression, the value is never actually read from 'ret' [deadcode.DeadStores] Signed-off-by: Colin Ian King Reviewed-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/rebalance.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index f24106dee21d6..2d22efed981a7 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -385,7 +385,6 @@ static int bch2_rebalance_thread(void *arg) struct bch_fs *c = arg; struct bch_fs_rebalance *r = &c->rebalance; struct moving_context ctxt; - int ret; set_freezable(); @@ -393,8 +392,7 @@ static int bch2_rebalance_thread(void *arg) writepoint_ptr(&c->rebalance_write_point), true); - while (!kthread_should_stop() && - !(ret = do_rebalance(&ctxt))) + while (!kthread_should_stop() && !do_rebalance(&ctxt)) ; bch2_moving_ctxt_exit(&ctxt); -- GitLab From 00fff4dd58661944af9cb4fe8fe61b4105931776 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Jan 2024 11:38:04 -0500 Subject: [PATCH 1248/1290] bcachefs: bios must be 512 byte algined Fixes: 023f9ac9f70f bcachefs: Delete dio read alignment check Reported-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-direct.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index fdd57c5785c9c..e3b219e19e100 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -77,6 +77,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) bch2_inode_opts_get(&opts, c, &inode->ei_inode); + /* bios must be 512 byte aligned: */ + if ((offset|iter->count) & (SECTOR_SIZE - 1)) + return -EINVAL; + ret = min_t(loff_t, iter->count, max_t(loff_t, 0, i_size_read(&inode->v) - offset)); -- GitLab From 369acf97d6fd5da620d053d0f1878ffe32eff555 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Tue, 16 Jan 2024 19:05:37 +0800 Subject: [PATCH 1249/1290] bcachefs: kvfree bch_fs::snapshots in bch2_fs_snapshots_exit bch_fs::snapshots is allocated by kvzalloc in __snapshot_t_mut. It should be freed by kvfree not kfree. Or umount will triger: [ 406.829178 ] BUG: unable to handle page fault for address: ffffe7b487148008 [ 406.830676 ] #PF: supervisor read access in kernel mode [ 406.831643 ] #PF: error_code(0x0000) - not-present page [ 406.832487 ] PGD 0 P4D 0 [ 406.832898 ] Oops: 0000 [#1] PREEMPT SMP PTI [ 406.833512 ] CPU: 2 PID: 1754 Comm: umount Kdump: loaded Tainted: G OE 6.7.0-rc7-custom+ #90 [ 406.834746 ] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 [ 406.835796 ] RIP: 0010:kfree+0x62/0x140 [ 406.836197 ] Code: 80 48 01 d8 0f 82 e9 00 00 00 48 c7 c2 00 00 00 80 48 2b 15 78 9f 1f 01 48 01 d0 48 c1 e8 0c 48 c1 e0 06 48 03 05 56 9f 1f 01 <48> 8b 50 08 48 89 c7 f6 c2 01 0f 85 b0 00 00 00 66 90 48 8b 07 f6 [ 406.837810 ] RSP: 0018:ffffb9d641607e48 EFLAGS: 00010286 [ 406.838213 ] RAX: ffffe7b487148000 RBX: ffffb9d645200000 RCX: ffffb9d641607dc4 [ 406.838738 ] RDX: 000065bb00000000 RSI: ffffffffc0d88b84 RDI: ffffb9d645200000 [ 406.839217 ] RBP: ffff9a4625d00068 R08: 0000000000000001 R09: 0000000000000001 [ 406.839650 ] R10: 0000000000000001 R11: 000000000000001f R12: ffff9a4625d4da80 [ 406.840055 ] R13: ffff9a4625d00000 R14: ffffffffc0e2eb20 R15: 0000000000000000 [ 406.840451 ] FS: 00007f0a264ffb80(0000) GS:ffff9a4e2d500000(0000) knlGS:0000000000000000 [ 406.840851 ] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 406.841125 ] CR2: ffffe7b487148008 CR3: 000000018c4d2000 CR4: 00000000000006f0 [ 406.841464 ] Call Trace: [ 406.841583 ] [ 406.841682 ] ? __die+0x1f/0x70 [ 406.841828 ] ? page_fault_oops+0x159/0x470 [ 406.842014 ] ? fixup_exception+0x22/0x310 [ 406.842198 ] ? exc_page_fault+0x1ed/0x200 [ 406.842382 ] ? asm_exc_page_fault+0x22/0x30 [ 406.842574 ] ? bch2_fs_release+0x54/0x280 [bcachefs] [ 406.842842 ] ? kfree+0x62/0x140 [ 406.842988 ] ? kfree+0x104/0x140 [ 406.843138 ] bch2_fs_release+0x54/0x280 [bcachefs] [ 406.843390 ] kobject_put+0xb7/0x170 [ 406.843552 ] deactivate_locked_super+0x2f/0xa0 [ 406.843756 ] cleanup_mnt+0xba/0x150 [ 406.843917 ] task_work_run+0x59/0xa0 [ 406.844083 ] exit_to_user_mode_prepare+0x197/0x1a0 [ 406.844302 ] syscall_exit_to_user_mode+0x16/0x40 [ 406.844510 ] do_syscall_64+0x4e/0xf0 [ 406.844675 ] entry_SYSCALL_64_after_hwframe+0x6e/0x76 [ 406.844907 ] RIP: 0033:0x7f0a2664e4fb Signed-off-by: Su Yue Reviewed-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 56af937523ff2..cdcff4e5ae5c3 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1681,5 +1681,5 @@ int bch2_snapshots_read(struct bch_fs *c) void bch2_fs_snapshots_exit(struct bch_fs *c) { - kfree(rcu_dereference_protected(c->snapshots, true)); + kvfree(rcu_dereference_protected(c->snapshots, true)); } -- GitLab From 2acc59dd88d27ad69b66ded80df16c042b04eeec Mon Sep 17 00:00:00 2001 From: Su Yue Date: Mon, 15 Jan 2024 10:21:25 +0800 Subject: [PATCH 1250/1290] bcachefs: grab s_umount only if snapshotting When I was testing mongodb over bcachefs with compression, there is a lockdep warning when snapshotting mongodb data volume. $ cat test.sh prog=bcachefs $prog subvolume create /mnt/data $prog subvolume create /mnt/data/snapshots while true;do $prog subvolume snapshot /mnt/data /mnt/data/snapshots/$(date +%s) sleep 1s done $ cat /etc/mongodb.conf systemLog: destination: file logAppend: true path: /mnt/data/mongod.log storage: dbPath: /mnt/data/ lockdep reports: [ 3437.452330] ====================================================== [ 3437.452750] WARNING: possible circular locking dependency detected [ 3437.453168] 6.7.0-rc7-custom+ #85 Tainted: G E [ 3437.453562] ------------------------------------------------------ [ 3437.453981] bcachefs/35533 is trying to acquire lock: [ 3437.454325] ffffa0a02b2b1418 (sb_writers#10){.+.+}-{0:0}, at: filename_create+0x62/0x190 [ 3437.454875] but task is already holding lock: [ 3437.455268] ffffa0a02b2b10e0 (&type->s_umount_key#48){.+.+}-{3:3}, at: bch2_fs_file_ioctl+0x232/0xc90 [bcachefs] [ 3437.456009] which lock already depends on the new lock. [ 3437.456553] the existing dependency chain (in reverse order) is: [ 3437.457054] -> #3 (&type->s_umount_key#48){.+.+}-{3:3}: [ 3437.457507] down_read+0x3e/0x170 [ 3437.457772] bch2_fs_file_ioctl+0x232/0xc90 [bcachefs] [ 3437.458206] __x64_sys_ioctl+0x93/0xd0 [ 3437.458498] do_syscall_64+0x42/0xf0 [ 3437.458779] entry_SYSCALL_64_after_hwframe+0x6e/0x76 [ 3437.459155] -> #2 (&c->snapshot_create_lock){++++}-{3:3}: [ 3437.459615] down_read+0x3e/0x170 [ 3437.459878] bch2_truncate+0x82/0x110 [bcachefs] [ 3437.460276] bchfs_truncate+0x254/0x3c0 [bcachefs] [ 3437.460686] notify_change+0x1f1/0x4a0 [ 3437.461283] do_truncate+0x7f/0xd0 [ 3437.461555] path_openat+0xa57/0xce0 [ 3437.461836] do_filp_open+0xb4/0x160 [ 3437.462116] do_sys_openat2+0x91/0xc0 [ 3437.462402] __x64_sys_openat+0x53/0xa0 [ 3437.462701] do_syscall_64+0x42/0xf0 [ 3437.462982] entry_SYSCALL_64_after_hwframe+0x6e/0x76 [ 3437.463359] -> #1 (&sb->s_type->i_mutex_key#15){+.+.}-{3:3}: [ 3437.463843] down_write+0x3b/0xc0 [ 3437.464223] bch2_write_iter+0x5b/0xcc0 [bcachefs] [ 3437.464493] vfs_write+0x21b/0x4c0 [ 3437.464653] ksys_write+0x69/0xf0 [ 3437.464839] do_syscall_64+0x42/0xf0 [ 3437.465009] entry_SYSCALL_64_after_hwframe+0x6e/0x76 [ 3437.465231] -> #0 (sb_writers#10){.+.+}-{0:0}: [ 3437.465471] __lock_acquire+0x1455/0x21b0 [ 3437.465656] lock_acquire+0xc6/0x2b0 [ 3437.465822] mnt_want_write+0x46/0x1a0 [ 3437.465996] filename_create+0x62/0x190 [ 3437.466175] user_path_create+0x2d/0x50 [ 3437.466352] bch2_fs_file_ioctl+0x2ec/0xc90 [bcachefs] [ 3437.466617] __x64_sys_ioctl+0x93/0xd0 [ 3437.466791] do_syscall_64+0x42/0xf0 [ 3437.466957] entry_SYSCALL_64_after_hwframe+0x6e/0x76 [ 3437.467180] other info that might help us debug this: [ 3437.469670] 2 locks held by bcachefs/35533: other info that might help us debug this: [ 3437.467507] Chain exists of: sb_writers#10 --> &c->snapshot_create_lock --> &type->s_umount_key#48 [ 3437.467979] Possible unsafe locking scenario: [ 3437.468223] CPU0 CPU1 [ 3437.468405] ---- ---- [ 3437.468585] rlock(&type->s_umount_key#48); [ 3437.468758] lock(&c->snapshot_create_lock); [ 3437.469030] lock(&type->s_umount_key#48); [ 3437.469291] rlock(sb_writers#10); [ 3437.469434] *** DEADLOCK *** [ 3437.469670] 2 locks held by bcachefs/35533: [ 3437.469838] #0: ffffa0a02ce00a88 (&c->snapshot_create_lock){++++}-{3:3}, at: bch2_fs_file_ioctl+0x1e3/0xc90 [bcachefs] [ 3437.470294] #1: ffffa0a02b2b10e0 (&type->s_umount_key#48){.+.+}-{3:3}, at: bch2_fs_file_ioctl+0x232/0xc90 [bcachefs] [ 3437.470744] stack backtrace: [ 3437.470922] CPU: 7 PID: 35533 Comm: bcachefs Kdump: loaded Tainted: G E 6.7.0-rc7-custom+ #85 [ 3437.471313] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 [ 3437.471694] Call Trace: [ 3437.471795] [ 3437.471884] dump_stack_lvl+0x57/0x90 [ 3437.472035] check_noncircular+0x132/0x150 [ 3437.472202] __lock_acquire+0x1455/0x21b0 [ 3437.472369] lock_acquire+0xc6/0x2b0 [ 3437.472518] ? filename_create+0x62/0x190 [ 3437.472683] ? lock_is_held_type+0x97/0x110 [ 3437.472856] mnt_want_write+0x46/0x1a0 [ 3437.473025] ? filename_create+0x62/0x190 [ 3437.473204] filename_create+0x62/0x190 [ 3437.473380] user_path_create+0x2d/0x50 [ 3437.473555] bch2_fs_file_ioctl+0x2ec/0xc90 [bcachefs] [ 3437.473819] ? lock_acquire+0xc6/0x2b0 [ 3437.474002] ? __fget_files+0x2a/0x190 [ 3437.474195] ? __fget_files+0xbc/0x190 [ 3437.474380] ? lock_release+0xc5/0x270 [ 3437.474567] ? __x64_sys_ioctl+0x93/0xd0 [ 3437.474764] ? __pfx_bch2_fs_file_ioctl+0x10/0x10 [bcachefs] [ 3437.475090] __x64_sys_ioctl+0x93/0xd0 [ 3437.475277] do_syscall_64+0x42/0xf0 [ 3437.475454] entry_SYSCALL_64_after_hwframe+0x6e/0x76 [ 3437.475691] RIP: 0033:0x7f2743c313af ====================================================== In __bch2_ioctl_subvolume_create(), we grab s_umount unconditionally and unlock it at the end of the function. There is a comment "why do we need this lock?" about the lock coming from commit 42d237320e98 ("bcachefs: Snapshot creation, deletion") The reason is that __bch2_ioctl_subvolume_create() calls sync_inodes_sb() which enforce locked s_umount to writeback all dirty nodes before doing snapshot works. Fix it by read locking s_umount for snapshotting only and unlocking s_umount after sync_inodes_sb(). Signed-off-by: Su Yue Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index e0a19a73c8e1a..1346861ed9443 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -337,11 +337,12 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) create_flags |= BCH_CREATE_SNAPSHOT_RO; - /* why do we need this lock? */ - down_read(&c->vfs_sb->s_umount); - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) { + /* sync_inodes_sb enforce s_umount is locked */ + down_read(&c->vfs_sb->s_umount); sync_inodes_sb(c->vfs_sb); + up_read(&c->vfs_sb->s_umount); + } retry: if (arg.src_ptr) { error = user_path_at(arg.dirfd, @@ -425,8 +426,6 @@ err2: goto retry; } err1: - up_read(&c->vfs_sb->s_umount); - return error; } -- GitLab From ec4edd7b9d2038a97e0ba3fad8fc8492b0d12d35 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Jan 2024 13:29:59 -0500 Subject: [PATCH 1251/1290] bcachefs: Prep work for variable size btree node buffers bcachefs btree nodes are big - typically 256k - and btree roots are pinned in memory. As we're now up to 18 btrees, we now have significant memory overhead in mostly empty btree roots. And in the future we're going to start enforcing that certain btree node boundaries exist, to solve lock contention issues - analagous to XFS's AGIs. Thus, we need to start allocating smaller btree node buffers when we can. This patch changes code that refers to the filesystem constant c->opts.btree_node_size to refer to the btree node buffer size - btree_buf_bytes() - where appropriate. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- fs/bcachefs/backpointers.h | 1 + fs/bcachefs/bcachefs.h | 5 ---- fs/bcachefs/bset.c | 5 ++-- fs/bcachefs/bset.h | 3 +-- fs/bcachefs/btree_cache.c | 12 ++++----- fs/bcachefs/btree_cache.h | 19 ++++++++----- fs/bcachefs/btree_io.c | 38 +++++++++++++------------- fs/bcachefs/btree_trans_commit.c | 9 +++---- fs/bcachefs/btree_update_interior.c | 8 +++--- fs/bcachefs/btree_update_interior.h | 42 ++++++++++++----------------- fs/bcachefs/btree_write_buffer.c | 7 +++-- fs/bcachefs/debug.c | 16 +++++------ fs/bcachefs/extents.c | 1 + fs/bcachefs/move.c | 10 ++++--- fs/bcachefs/super.c | 2 +- fs/bcachefs/sysfs.c | 2 +- fs/bcachefs/trace.h | 2 +- 18 files changed, 87 insertions(+), 97 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index bf828f1f28d2b..b4dc319bcb2bc 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -560,7 +560,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c) si_meminfo(&i); mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes >> 1, btree_bytes(c)); + return div_u64(mem_bytes >> 1, c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 737e2396ade7e..327365a9feac4 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H #define _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#include "btree_cache.h" #include "btree_iter.h" #include "btree_update.h" #include "buckets.h" diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index dac383e371816..b80c6c9efd8ce 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1204,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c) return c->opts.block_size >> 9; } -static inline size_t btree_sectors(const struct bch_fs *c) -{ - return c->opts.btree_node_size >> 9; -} - static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) { return c->btree_key_cache_btrees & (1U << btree); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 044fff9b2cf6e..3fd1085b6c61e 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -823,13 +823,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i) set_btree_bset(b, t, i); } -void bch2_bset_init_next(struct bch_fs *c, struct btree *b, - struct btree_node_entry *bne) +void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne) { struct bset *i = &bne->keys; struct bset_tree *t; - BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); + BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b)); BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); BUG_ON(b->nsets >= MAX_BSETS); diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 632c2b8c54609..79c77baaa3838 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -264,8 +264,7 @@ static inline struct bset *bset_next_set(struct btree *b, void bch2_btree_keys_init(struct btree *); void bch2_bset_init_first(struct btree *, struct bset *); -void bch2_bset_init_next(struct bch_fs *, struct btree *, - struct btree_node_entry *); +void bch2_bset_init_next(struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); void bch2_bset_insert(struct btree *, struct btree_node_iter *, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 8e2488a4b58d0..d7c81beac14af 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); - kvpfree(b->data, btree_bytes(c)); + kvpfree(b->data, btree_buf_bytes(b)); b->data = NULL; #ifdef __KERNEL__ kvfree(b->aux_data); @@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); - b->data = kvpmalloc(btree_bytes(c), gfp); + b->data = kvpmalloc(btree_buf_bytes(b), gfp); if (!b->data) return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ @@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = NULL; #endif if (!b->aux_data) { - kvpfree(b->data, btree_bytes(c)); + kvpfree(b->data, btree_buf_bytes(b)); b->data = NULL; return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } @@ -126,7 +126,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) bkey_btree_ptr_init(&b->key); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); - b->byte_order = ilog2(btree_bytes(c)); + b->byte_order = ilog2(c->opts.btree_node_size); return b; } @@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (c->verify_data) list_move(&c->verify_data->list, &bc->live); - kvpfree(c->verify_ondisk, btree_bytes(c)); + kvpfree(c->verify_ondisk, c->opts.btree_node_size); for (i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); @@ -1192,7 +1192,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc " failed unpacked %zu\n", b->unpack_fn_len, b->nr.live_u64s * sizeof(u64), - btree_bytes(c) - sizeof(struct btree_node), + btree_buf_bytes(b) - sizeof(struct btree_node), b->nr.live_u64s * 100 / btree_max_u64s(c), b->sib_u64s[0], b->sib_u64s[1], diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 4e1af58820522..6d33885fdbde0 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -74,22 +74,27 @@ static inline bool btree_node_hashed(struct btree *b) _iter = 0; _iter < (_tbl)->size; _iter++) \ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) -static inline size_t btree_bytes(struct bch_fs *c) +static inline size_t btree_buf_bytes(const struct btree *b) { - return c->opts.btree_node_size; + return 1UL << b->byte_order; } -static inline size_t btree_max_u64s(struct bch_fs *c) +static inline size_t btree_buf_max_u64s(const struct btree *b) { - return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); + return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64); } -static inline size_t btree_pages(struct bch_fs *c) +static inline size_t btree_max_u64s(const struct bch_fs *c) { - return btree_bytes(c) / PAGE_SIZE; + return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64); } -static inline unsigned btree_blocks(struct bch_fs *c) +static inline size_t btree_sectors(const struct bch_fs *c) +{ + return c->opts.btree_node_size >> SECTOR_SHIFT; +} + +static inline unsigned btree_blocks(const struct bch_fs *c) { return btree_sectors(c) >> c->block_bits; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 33db48e2153fe..aa9b6cbe32269 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -112,7 +112,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, unsigned flags = memalloc_nofs_save(); void *p; - BUG_ON(size > btree_bytes(c)); + BUG_ON(size > c->opts.btree_node_size); *used_mempool = false; p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); @@ -174,8 +174,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) ptrs = ptrs_end = ((void *) new_whiteouts + bytes); - for (k = unwritten_whiteouts_start(c, b); - k != unwritten_whiteouts_end(c, b); + for (k = unwritten_whiteouts_start(b); + k != unwritten_whiteouts_end(b); k = bkey_p_next(k)) *--ptrs = k; @@ -192,7 +192,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) verify_no_dups(b, new_whiteouts, (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); - memcpy_u64s(unwritten_whiteouts_start(c, b), + memcpy_u64s(unwritten_whiteouts_start(b), new_whiteouts, b->whiteout_u64s); btree_bounce_free(c, bytes, used_mempool, new_whiteouts); @@ -313,7 +313,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, } bytes = sorting_entire_node - ? btree_bytes(c) + ? btree_buf_bytes(b) : __vstruct_bytes(struct btree_node, u64s); out = btree_bounce_alloc(c, bytes, &used_mempool); @@ -338,7 +338,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, if (sorting_entire_node) { u64s = le16_to_cpu(out->keys.u64s); - BUG_ON(bytes != btree_bytes(c)); + BUG_ON(bytes != btree_buf_bytes(b)); /* * Our temporary buffer is the same size as the btree node's @@ -502,7 +502,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(c, b, bne); + bch2_bset_init_next(b, bne); bch2_btree_build_aux_trees(b); @@ -1160,7 +1160,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ptr_written, b->written); } else { for (bne = write_block(b); - bset_byte_offset(b, bne) < btree_bytes(c); + bset_byte_offset(b, bne) < btree_buf_bytes(b); bne = (void *) bne + block_bytes(c)) btree_err_on(bne->keys.seq == b->data->keys.seq && !bch2_journal_seq_is_blacklisted(c, @@ -1172,7 +1172,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "found bset signature after last bset"); } - sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); + sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); sorted->keys.u64s = 0; set_btree_bset(b, b->set, &b->data->keys); @@ -1188,7 +1188,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, BUG_ON(b->nr.live_u64s != u64s); - btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); + btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); if (updated_range) bch2_btree_node_drop_keys_outside_node(b); @@ -1284,7 +1284,7 @@ static void btree_node_read_work(struct work_struct *work) rb->have_ioref = bch2_dev_get_ioref(ca, READ); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; - bio->bi_iter.bi_size = btree_bytes(c); + bio->bi_iter.bi_size = btree_buf_bytes(b); if (rb->have_ioref) { bio_set_dev(bio, ca->disk_sb.bdev); @@ -1512,7 +1512,7 @@ fsck_err: } if (best >= 0) { - memcpy(b->data, ra->buf[best], btree_bytes(c)); + memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); } else { ret = -1; @@ -1578,7 +1578,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool for (i = 0; i < ra->nr; i++) { ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); ra->bio[i] = bio_alloc_bioset(NULL, - buf_pages(ra->buf[i], btree_bytes(c)), + buf_pages(ra->buf[i], btree_buf_bytes(b)), REQ_OP_READ|REQ_SYNC|REQ_META, GFP_NOFS, &c->btree_bio); @@ -1598,7 +1598,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool rb->pick = pick; rb->bio.bi_iter.bi_sector = pick.ptr.offset; rb->bio.bi_end_io = btree_node_read_all_replicas_endio; - bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); + bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b)); if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], @@ -1665,7 +1665,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, ca = bch_dev_bkey_exists(c, pick.ptr.dev); bio = bio_alloc_bioset(NULL, - buf_pages(b->data, btree_bytes(c)), + buf_pages(b->data, btree_buf_bytes(b)), REQ_OP_READ|REQ_SYNC|REQ_META, GFP_NOFS, &c->btree_bio); @@ -1679,7 +1679,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, INIT_WORK(&rb->work, btree_node_read_work); bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_end_io = btree_node_read_endio; - bch2_bio_map(bio, b->data, btree_bytes(c)); + bch2_bio_map(bio, b->data, btree_buf_bytes(b)); if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], @@ -2074,8 +2074,8 @@ do_write: i->u64s = 0; sort_iter_add(&sort_iter.iter, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); + unwritten_whiteouts_start(b), + unwritten_whiteouts_end(b)); SET_BSET_SEPARATE_WHITEOUTS(i, false); b->whiteout_u64s = 0; @@ -2251,7 +2251,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(c, b, bne); + bch2_bset_init_next(b, bne); bch2_btree_build_aux_trees(b); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index ab00d202361eb..10f2478e45d18 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -139,8 +139,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); - EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, b)); + EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); k = bch2_btree_node_iter_peek_all(node_iter, b); @@ -160,7 +159,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, k->type = KEY_TYPE_deleted; if (k->needs_whiteout) - push_whiteout(trans->c, b, insert->k.p); + push_whiteout(b, insert->k.p); k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { @@ -348,9 +347,7 @@ static noinline void journal_transaction_name(struct btree_trans *trans) static inline int btree_key_can_insert(struct btree_trans *trans, struct btree *b, unsigned u64s) { - struct bch_fs *c = trans->c; - - if (!bch2_btree_node_insert_fits(c, b, u64s)) + if (!bch2_btree_node_insert_fits(b, u64s)) return -BCH_ERR_btree_insert_btree_node_full; return 0; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 44f9dfa28a09d..17a5938aa71a6 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -159,7 +159,7 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, { size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f); - return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); + return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b); } /* Btree node freeing/allocation: */ @@ -1097,7 +1097,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * Always check for space for two keys, even if we won't have to * split at prior level - it might have been a merge instead: */ - if (bch2_btree_node_insert_fits(c, path->l[update_level].b, + if (bch2_btree_node_insert_fits(path->l[update_level].b, BKEY_BTREE_PTR_U64s_MAX * 2)) break; @@ -1401,7 +1401,7 @@ static void __btree_split_node(struct btree_update *as, unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s + nr_keys[i].val_u64s; - if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c)) + if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b)) n[i]->data->format = b->format; btree_node_set_format(n[i], n[i]->data->format); @@ -1703,7 +1703,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_btree_node_prep_for_write(trans, path, b); - if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { + if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) { bch2_btree_node_unlock_write(trans, path, b); goto split; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index adfc62083844c..c593c925d1e3b 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -184,21 +184,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b) b->sib_u64s[1] = b->nr.live_u64s; } -static inline void *btree_data_end(struct bch_fs *c, struct btree *b) +static inline void *btree_data_end(struct btree *b) { - return (void *) b->data + btree_bytes(c); + return (void *) b->data + btree_buf_bytes(b); } -static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, - struct btree *b) +static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b) { - return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); + return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s); } -static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, - struct btree *b) +static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b) { - return btree_data_end(c, b); + return btree_data_end(b); } static inline void *write_block(struct btree *b) @@ -221,13 +219,11 @@ static inline bool bkey_written(struct btree *b, struct bkey_packed *k) return __btree_addr_written(b, k); } -static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, - struct btree *b, - void *end) +static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end) { ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + b->whiteout_u64s; - ssize_t total = c->opts.btree_node_size >> 3; + ssize_t total = btree_buf_bytes(b) >> 3; /* Always leave one extra u64 for bch2_varint_decode: */ used++; @@ -235,10 +231,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, return total - used; } -static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, - struct btree *b) +static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b) { - ssize_t remaining = __bch_btree_u64s_remaining(c, b, + ssize_t remaining = __bch2_btree_u64s_remaining(b, btree_bkey_last(b, bset_tree_last(b))); BUG_ON(remaining < 0); @@ -260,14 +255,13 @@ static inline unsigned btree_write_set_buffer(struct btree *b) return 8 << BTREE_WRITE_SET_U64s_BITS; } -static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, - struct btree *b) +static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b) { struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), (void *) btree_bkey_last(b, bset_tree_last(b))); ssize_t remaining_space = - __bch_btree_u64s_remaining(c, b, bne->keys.start); + __bch2_btree_u64s_remaining(b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) @@ -281,12 +275,11 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, return NULL; } -static inline void push_whiteout(struct bch_fs *c, struct btree *b, - struct bpos pos) +static inline void push_whiteout(struct btree *b, struct bpos pos) { struct bkey_packed k; - BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); + BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s); EBUG_ON(btree_node_just_written(b)); if (!bkey_pack_pos(&k, pos, b)) { @@ -299,20 +292,19 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, k.needs_whiteout = true; b->whiteout_u64s += k.u64s; - bkey_p_copy(unwritten_whiteouts_start(c, b), &k); + bkey_p_copy(unwritten_whiteouts_start(b), &k); } /* * write lock must be held on @b (else the dirty bset that we were going to * insert into could be written out from under us) */ -static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, - struct btree *b, unsigned u64s) +static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s) { if (unlikely(btree_node_need_rewrite(b))) return false; - return u64s <= bch_btree_keys_u64s_remaining(c, b); + return u64s <= bch2_btree_keys_u64s_remaining(b); } void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 5c1169c78dafe..ac78448619663 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -125,13 +125,12 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite struct btree_write_buffered_key *wb, bool *write_locked, size_t *fast) { - struct bch_fs *c = trans->c; struct btree_path *path; int ret; EBUG_ON(!wb->journal_seq); - EBUG_ON(!c->btree_write_buffer.flushing.pin.seq); - EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); + EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); + EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); ret = bch2_btree_iter_traverse(iter); if (ret) @@ -155,7 +154,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite *write_locked = true; } - if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) { + if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) { *write_locked = false; return wb_flush_one_slowpath(trans, iter, wb); } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index d6418948495f8..cadda9bbe4a4c 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -44,19 +44,19 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, return false; bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_sorted, btree_bytes(c)), + buf_pages(n_sorted, btree_buf_bytes(b)), REQ_OP_READ|REQ_META, GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_sorted, btree_bytes(c)); + bch2_bio_map(bio, n_sorted, btree_buf_bytes(b)); submit_bio_wait(bio); bio_put(bio); percpu_ref_put(&ca->io_ref); - memcpy(n_ondisk, n_sorted, btree_bytes(c)); + memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); v->written = 0; if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) @@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { - c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!c->verify_ondisk) goto out; } @@ -199,19 +199,19 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!n_ondisk) { prt_printf(out, "memory allocation failure\n"); goto out; } bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_ondisk, btree_bytes(c)), + buf_pages(n_ondisk, btree_buf_bytes(b)), REQ_OP_READ|REQ_META, GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_ondisk, btree_bytes(c)); + bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b)); ret = submit_bio_wait(bio); if (ret) { @@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, out: if (bio) bio_put(bio); - kvpfree(n_ondisk, btree_bytes(c)); + kvpfree(n_ondisk, btree_buf_bytes(b)); percpu_ref_put(&ca->io_ref); } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index edb1e32d77831..3ae4aba4f151b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "btree_cache.h" #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2e083daedfb2b..dc284a89bd2db 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -6,6 +6,7 @@ #include "backpointers.h" #include "bkey_buf.h" #include "btree_gc.h" +#include "btree_io.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" @@ -804,6 +805,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (!b) goto next; + unsigned sectors = btree_ptr_sectors_written(&b->key); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); bch2_trans_iter_exit(trans, &iter); @@ -813,11 +816,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, goto err; if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, - c->opts.btree_node_size >> 9); + bch2_ratelimit_increment(ctxt->rate, sectors); if (ctxt->stats) { - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + atomic64_add(sectors, &ctxt->stats->sectors_seen); + atomic64_add(sectors, &ctxt->stats->sectors_moved); } } next: diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a3ec21f229ed6..9262a9298fcd1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -883,7 +883,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, - btree_bytes(c)) || + c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 553190d719dfc..46c4e98dc1004 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -248,7 +248,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) mutex_lock(&c->btree_cache.lock); list_for_each_entry(b, &c->btree_cache.live, list) - ret += btree_bytes(c); + ret += btree_buf_bytes(b); mutex_unlock(&c->btree_cache.lock); return ret; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 6ac1dfeaa8f29..293b90d704fb5 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1013,7 +1013,7 @@ TRACE_EVENT(trans_restart_split_race, __entry->level = b->c.level; __entry->written = b->written; __entry->blocks = btree_blocks(trans->c); - __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b); + __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b); ), TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", -- GitLab From d7e77f53e90e1eb87838eed7c651531427b9114a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Jan 2024 16:20:21 -0500 Subject: [PATCH 1252/1290] bcachefs: opts->compression can now also be applied in the background The "apply this compression method in the background" paths now use the compression option if background_compression is not set; this means that setting or changing the compression option will cause existing data to be compressed accordingly in the background. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 6 ++---- fs/bcachefs/extents.c | 4 +++- fs/bcachefs/extents.h | 2 +- fs/bcachefs/io_misc.c | 4 +--- fs/bcachefs/io_write.c | 4 +--- fs/bcachefs/move.c | 2 +- fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/rebalance.c | 5 ++--- fs/bcachefs/reflink.c | 4 +--- fs/bcachefs/sysfs.c | 6 ++++-- fs/bcachefs/xattr.c | 5 +++-- 11 files changed, 24 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 6f13477ff652e..4150feca42a2e 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -285,9 +285,7 @@ restart_drop_extra_replicas: k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, insert, - op->opts.background_target, - op->opts.background_compression) ?: + bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, &op->res, @@ -529,7 +527,7 @@ int bch2_data_update_init(struct btree_trans *trans, BCH_WRITE_DATA_ENCODED| BCH_WRITE_MOVE| m->data_opts.write_flags; - m->op.compression_opt = io_opts.background_compression ?: io_opts.compression; + m->op.compression_opt = background_compression(io_opts); m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; bkey_for_each_ptr(ptrs, ptr) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 3ae4aba4f151b..61395b113df9b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1335,10 +1335,12 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) } int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, - unsigned target, unsigned compression) + struct bch_io_opts *opts) { struct bkey_s k = bkey_i_to_s(_k); struct bch_extent_rebalance *r; + unsigned target = opts->background_target; + unsigned compression = background_compression(*opts); bool needs_rebalance; if (!bkey_extent_is_direct_data(k.k)) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index a855c94d43ddb..6bf839d69e84e 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -708,7 +708,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, - unsigned, unsigned); + struct bch_io_opts *); /* Generic extent code: */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index ca6d5f516aa2b..1baf78594ccaf 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -442,9 +442,7 @@ case LOGGED_OP_FINSERT_shift_extents: op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_bkey_set_needs_rebalance(c, copy, - opts.background_target, - opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index e69c00fa32bdb..ef3a53f9045af 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -362,9 +362,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_bkey_set_needs_rebalance(c, sk.k, - op->opts.background_target, - op->opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index dc284a89bd2db..bf68ea49447b9 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -58,7 +58,7 @@ static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c prt_str(out, "compression: "); prt_tab(out); - bch2_compression_opt_to_text(out, io_opts->background_compression ?: io_opts->compression); + bch2_compression_opt_to_text(out, background_compression(*io_opts)); prt_newline(out); prt_str(out, "extra replicas: "); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 7414c564b5d83..9a4b7faa37650 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -564,6 +564,11 @@ struct bch_io_opts { #undef x }; +static inline unsigned background_compression(struct bch_io_opts opts) +{ + return opts.background_compression ?: opts.compression; +} + struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 2d22efed981a7..22d1017aa49b9 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -253,13 +253,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, if (k.k->p.inode) { target = io_opts->background_target; - compression = io_opts->background_compression ?: io_opts->compression; + compression = background_compression(*io_opts); } else { const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); target = r ? r->target : io_opts->background_target; - compression = r ? r->compression : - (io_opts->background_compression ?: io_opts->compression); + compression = r ? r->compression : background_compression(*io_opts); } data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 98255aa64e226..c47c66c2b394d 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -545,9 +545,7 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, - opts.background_target, - opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?: bch2_extent_update(trans, dst_inum, &dst_iter, new_dst.k, &disk_res, new_i_size, i_sectors_delta, diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 46c4e98dc1004..cee80c47feea2 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -726,8 +726,10 @@ STORE(bch2_fs_opts_dir) bch2_opt_set_sb(c, opt, v); bch2_opt_set_by_id(&c->opts, id, v); - if ((id == Opt_background_target || - id == Opt_background_compression) && v) + if (v && + (id == Opt_background_target || + id == Opt_background_compression || + (id == Opt_compression && !c->opts.background_compression))) bch2_set_rebalance_needs_scan(c, 0); ret = size; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 5a1858fb9879a..9c0d2316031b1 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -590,8 +590,9 @@ err: mutex_unlock(&inode->ei_update_lock); if (value && - (opt_id == Opt_background_compression || - opt_id == Opt_background_target)) + (opt_id == Opt_background_target || + opt_id == Opt_background_compression || + (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); return bch2_err_class(ret); -- GitLab From 7be0208fc99207e86974f40a3b57949dae67976c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jan 2024 17:16:07 -0500 Subject: [PATCH 1253/1290] bcachefs: add missing __GFP_NOWARN Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 10f2478e45d18..30d69a6d133ee 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -415,7 +415,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags return 0; new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_k)) return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); -- GitLab From d32088f2f2f0f361caeb87dfc71b632231fd6c7b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Jan 2024 23:35:41 -0500 Subject: [PATCH 1254/1290] bcachefs: bch_snapshot::btime Add a field to bch_snapshot for creation time; this will be important when we start exposing the snapshot tree to userspace. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/snapshot.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0d5ac4184fbce..a76036179238e 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1149,6 +1149,7 @@ struct bch_snapshot { __le32 tree; __le32 depth; __le32 skip[3]; + bch_le128 btime; }; LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index cdcff4e5ae5c3..45f67e8b29eb6 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1053,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, n->v.subvol = cpu_to_le32(snapshot_subvols[i]); n->v.tree = cpu_to_le32(tree); n->v.depth = cpu_to_le32(depth); + n->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + n->v.btime.hi = 0; for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); -- GitLab From 12207f49ef41d5599fb313d103f2c7b485848c9d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Jan 2024 23:44:17 -0500 Subject: [PATCH 1255/1290] bcachefs: comment bch_subvolume Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a76036179238e..6abd19cdbfcfe 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1123,6 +1123,9 @@ struct bch_subvolume { * Snapshot subvolumes form a tree, separate from the snapshot nodes * tree - if this subvolume is a snapshot, this is the ID of the * subvolume it was created from: + * + * This is _not_ necessarily the subvolume of the directory containing + * this subvolume: */ __le32 parent; __le32 pad; -- GitLab From 3a58dfbc46c277b090f1b72c949e15da7e1290bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Jan 2024 23:46:35 -0500 Subject: [PATCH 1256/1290] bcachefs: counters.c -> sb-counters.c Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 2 +- fs/bcachefs/{counters.c => sb-counters.c} | 2 +- fs/bcachefs/{counters.h => sb-counters.h} | 7 +++---- fs/bcachefs/super-io.c | 2 +- fs/bcachefs/super.c | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) rename fs/bcachefs/{counters.c => sb-counters.c} (99%) rename fs/bcachefs/{counters.h => sb-counters.h} (77%) diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 7423a3557c680..1a05cecda7cc5 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -27,7 +27,6 @@ bcachefs-y := \ checksum.o \ clock.o \ compress.o \ - counters.o \ darray.o \ debug.o \ dirent.o \ @@ -71,6 +70,7 @@ bcachefs-y := \ reflink.o \ replicas.o \ sb-clean.o \ + sb-counters.o \ sb-downgrade.o \ sb-errors.o \ sb-members.o \ diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/sb-counters.c similarity index 99% rename from fs/bcachefs/counters.c rename to fs/bcachefs/sb-counters.c index 02a996e06a64e..7dc898761bb31 100644 --- a/fs/bcachefs/counters.c +++ b/fs/bcachefs/sb-counters.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "super-io.h" -#include "counters.h" +#include "sb-counters.h" /* BCH_SB_FIELD_counters */ diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/sb-counters.h similarity index 77% rename from fs/bcachefs/counters.h rename to fs/bcachefs/sb-counters.h index 4778aa19bf346..81f8aec9fcb1c 100644 --- a/fs/bcachefs/counters.h +++ b/fs/bcachefs/sb-counters.h @@ -1,11 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_COUNTERS_H -#define _BCACHEFS_COUNTERS_H +#ifndef _BCACHEFS_SB_COUNTERS_H +#define _BCACHEFS_SB_COUNTERS_H #include "bcachefs.h" #include "super-io.h" - int bch2_sb_counters_to_cpu(struct bch_fs *); int bch2_sb_counters_from_cpu(struct bch_fs *); @@ -14,4 +13,4 @@ int bch2_fs_counters_init(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_counters; -#endif // _BCACHEFS_COUNTERS_H +#endif // _BCACHEFS_SB_COUNTERS_H diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 9564d2d9ccaef..4e4da1a5e5d78 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -2,7 +2,6 @@ #include "bcachefs.h" #include "checksum.h" -#include "counters.h" #include "disk_groups.h" #include "ec.h" #include "error.h" @@ -13,6 +12,7 @@ #include "replicas.h" #include "quota.h" #include "sb-clean.h" +#include "sb-counters.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "sb-members.h" diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 9262a9298fcd1..670fe1e6733ad 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -23,7 +23,6 @@ #include "checksum.h" #include "clock.h" #include "compress.h" -#include "counters.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" @@ -49,6 +48,7 @@ #include "recovery.h" #include "replicas.h" #include "sb-clean.h" +#include "sb-counters.h" #include "sb-errors.h" #include "sb-members.h" #include "snapshot.h" -- GitLab From 43314801a43985aa78cf475ccbdb3c520aa1e3d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Jan 2024 23:50:56 -0500 Subject: [PATCH 1257/1290] bcachefs: sb-counters_format.h bcachefs_format.h has gotten too big; let's do some organizing. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 97 +------------------------------ fs/bcachefs/sb-counters_format.h | 98 ++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 95 deletions(-) create mode 100644 fs/bcachefs/sb-counters_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6abd19cdbfcfe..a9d2086ea2be3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1234,6 +1234,8 @@ struct bch_sb_field { x(ext, 13) \ x(downgrade, 14) +#include "sb-counters_format.h" + enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, BCH_SB_FIELDS() @@ -1504,101 +1506,6 @@ struct bch_sb_field_disk_groups { struct bch_disk_group entries[]; } __packed __aligned(8); -/* BCH_SB_FIELD_counters */ - -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) \ - x(bucket_invalidate, 3) \ - x(bucket_discard, 4) \ - x(bucket_alloc, 5) \ - x(bucket_alloc_fail, 6) \ - x(btree_cache_scan, 7) \ - x(btree_cache_reap, 8) \ - x(btree_cache_cannibalize, 9) \ - x(btree_cache_cannibalize_lock, 10) \ - x(btree_cache_cannibalize_lock_fail, 11) \ - x(btree_cache_cannibalize_unlock, 12) \ - x(btree_node_write, 13) \ - x(btree_node_read, 14) \ - x(btree_node_compact, 15) \ - x(btree_node_merge, 16) \ - x(btree_node_split, 17) \ - x(btree_node_rewrite, 18) \ - x(btree_node_alloc, 19) \ - x(btree_node_free, 20) \ - x(btree_node_set_root, 21) \ - x(btree_path_relock_fail, 22) \ - x(btree_path_upgrade_fail, 23) \ - x(btree_reserve_get_fail, 24) \ - x(journal_entry_full, 25) \ - x(journal_full, 26) \ - x(journal_reclaim_finish, 27) \ - x(journal_reclaim_start, 28) \ - x(journal_write, 29) \ - x(read_promote, 30) \ - x(read_bounce, 31) \ - x(read_split, 33) \ - x(read_retry, 32) \ - x(read_reuse_race, 34) \ - x(move_extent_read, 35) \ - x(move_extent_write, 36) \ - x(move_extent_finish, 37) \ - x(move_extent_fail, 38) \ - x(move_extent_start_fail, 39) \ - x(copygc, 40) \ - x(copygc_wait, 41) \ - x(gc_gens_end, 42) \ - x(gc_gens_start, 43) \ - x(trans_blocked_journal_reclaim, 44) \ - x(trans_restart_btree_node_reused, 45) \ - x(trans_restart_btree_node_split, 46) \ - x(trans_restart_fault_inject, 47) \ - x(trans_restart_iter_upgrade, 48) \ - x(trans_restart_journal_preres_get, 49) \ - x(trans_restart_journal_reclaim, 50) \ - x(trans_restart_journal_res_get, 51) \ - x(trans_restart_key_cache_key_realloced, 52) \ - x(trans_restart_key_cache_raced, 53) \ - x(trans_restart_mark_replicas, 54) \ - x(trans_restart_mem_realloced, 55) \ - x(trans_restart_memory_allocation_failure, 56) \ - x(trans_restart_relock, 57) \ - x(trans_restart_relock_after_fill, 58) \ - x(trans_restart_relock_key_cache_fill, 59) \ - x(trans_restart_relock_next_node, 60) \ - x(trans_restart_relock_parent_for_fill, 61) \ - x(trans_restart_relock_path, 62) \ - x(trans_restart_relock_path_intent, 63) \ - x(trans_restart_too_many_iters, 64) \ - x(trans_restart_traverse, 65) \ - x(trans_restart_upgrade, 66) \ - x(trans_restart_would_deadlock, 67) \ - x(trans_restart_would_deadlock_write, 68) \ - x(trans_restart_injected, 69) \ - x(trans_restart_key_cache_upgrade, 70) \ - x(trans_traverse_all, 71) \ - x(transaction_commit, 72) \ - x(write_super, 73) \ - x(trans_restart_would_deadlock_recursion_limit, 74) \ - x(trans_restart_write_buffer_flush, 75) \ - x(trans_restart_split_race, 76) \ - x(write_buffer_flush_slowpath, 77) \ - x(write_buffer_flush_sync, 78) - -enum bch_persistent_counters { -#define x(t, n, ...) BCH_COUNTER_##t, - BCH_PERSISTENT_COUNTERS() -#undef x - BCH_COUNTER_NR -}; - -struct bch_sb_field_counters { - struct bch_sb_field field; - __le64 d[]; -}; - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h new file mode 100644 index 0000000000000..62ea478215d08 --- /dev/null +++ b/fs/bcachefs/sb-counters_format.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H +#define _BCACHEFS_SB_COUNTERS_FORMAT_H + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) \ + x(bucket_invalidate, 3) \ + x(bucket_discard, 4) \ + x(bucket_alloc, 5) \ + x(bucket_alloc_fail, 6) \ + x(btree_cache_scan, 7) \ + x(btree_cache_reap, 8) \ + x(btree_cache_cannibalize, 9) \ + x(btree_cache_cannibalize_lock, 10) \ + x(btree_cache_cannibalize_lock_fail, 11) \ + x(btree_cache_cannibalize_unlock, 12) \ + x(btree_node_write, 13) \ + x(btree_node_read, 14) \ + x(btree_node_compact, 15) \ + x(btree_node_merge, 16) \ + x(btree_node_split, 17) \ + x(btree_node_rewrite, 18) \ + x(btree_node_alloc, 19) \ + x(btree_node_free, 20) \ + x(btree_node_set_root, 21) \ + x(btree_path_relock_fail, 22) \ + x(btree_path_upgrade_fail, 23) \ + x(btree_reserve_get_fail, 24) \ + x(journal_entry_full, 25) \ + x(journal_full, 26) \ + x(journal_reclaim_finish, 27) \ + x(journal_reclaim_start, 28) \ + x(journal_write, 29) \ + x(read_promote, 30) \ + x(read_bounce, 31) \ + x(read_split, 33) \ + x(read_retry, 32) \ + x(read_reuse_race, 34) \ + x(move_extent_read, 35) \ + x(move_extent_write, 36) \ + x(move_extent_finish, 37) \ + x(move_extent_fail, 38) \ + x(move_extent_start_fail, 39) \ + x(copygc, 40) \ + x(copygc_wait, 41) \ + x(gc_gens_end, 42) \ + x(gc_gens_start, 43) \ + x(trans_blocked_journal_reclaim, 44) \ + x(trans_restart_btree_node_reused, 45) \ + x(trans_restart_btree_node_split, 46) \ + x(trans_restart_fault_inject, 47) \ + x(trans_restart_iter_upgrade, 48) \ + x(trans_restart_journal_preres_get, 49) \ + x(trans_restart_journal_reclaim, 50) \ + x(trans_restart_journal_res_get, 51) \ + x(trans_restart_key_cache_key_realloced, 52) \ + x(trans_restart_key_cache_raced, 53) \ + x(trans_restart_mark_replicas, 54) \ + x(trans_restart_mem_realloced, 55) \ + x(trans_restart_memory_allocation_failure, 56) \ + x(trans_restart_relock, 57) \ + x(trans_restart_relock_after_fill, 58) \ + x(trans_restart_relock_key_cache_fill, 59) \ + x(trans_restart_relock_next_node, 60) \ + x(trans_restart_relock_parent_for_fill, 61) \ + x(trans_restart_relock_path, 62) \ + x(trans_restart_relock_path_intent, 63) \ + x(trans_restart_too_many_iters, 64) \ + x(trans_restart_traverse, 65) \ + x(trans_restart_upgrade, 66) \ + x(trans_restart_would_deadlock, 67) \ + x(trans_restart_would_deadlock_write, 68) \ + x(trans_restart_injected, 69) \ + x(trans_restart_key_cache_upgrade, 70) \ + x(trans_traverse_all, 71) \ + x(transaction_commit, 72) \ + x(write_super, 73) \ + x(trans_restart_would_deadlock_recursion_limit, 74) \ + x(trans_restart_write_buffer_flush, 75) \ + x(trans_restart_split_race, 76) \ + x(write_buffer_flush_slowpath, 77) \ + x(write_buffer_flush_sync, 78) + +enum bch_persistent_counters { +#define x(t, n, ...) BCH_COUNTER_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_NR +}; + +struct bch_sb_field_counters { + struct bch_sb_field field; + __le64 d[]; +}; + +#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */ -- GitLab From 82de6207fb20ea9a467065f4c8ec382affc38405 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Jan 2024 23:53:52 -0500 Subject: [PATCH 1258/1290] bcachefs; quota_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 43 +------------------------------- fs/bcachefs/quota_format.h | 47 +++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 42 deletions(-) create mode 100644 fs/bcachefs/quota_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a9d2086ea2be3..2e91403f82355 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1030,31 +1030,6 @@ struct bch_bucket_gens { u8 gens[KEY_TYPE_BUCKET_GENS_NR]; } __packed __aligned(8); -/* Quotas: */ - -enum quota_types { - QTYP_USR = 0, - QTYP_GRP = 1, - QTYP_PRJ = 2, - QTYP_NR = 3, -}; - -enum quota_counters { - Q_SPC = 0, - Q_INO = 1, - Q_COUNTERS = 2, -}; - -struct bch_quota_counter { - __le64 hardlimit; - __le64 softlimit; -}; - -struct bch_quota { - struct bch_val v; - struct bch_quota_counter c[Q_COUNTERS]; -} __packed __aligned(8); - /* Erasure coding */ struct bch_stripe { @@ -1234,6 +1209,7 @@ struct bch_sb_field { x(ext, 13) \ x(downgrade, 14) +#include "quota_format.h" #include "sb-counters_format.h" enum bch_sb_field_type { @@ -1471,23 +1447,6 @@ struct bch_sb_field_replicas { struct bch_replicas_entry_v1 entries[]; } __packed __aligned(8); -/* BCH_SB_FIELD_quota: */ - -struct bch_sb_quota_counter { - __le32 timelimit; - __le32 warnlimit; -}; - -struct bch_sb_quota_type { - __le64 flags; - struct bch_sb_quota_counter c[Q_COUNTERS]; -}; - -struct bch_sb_field_quota { - struct bch_sb_field field; - struct bch_sb_quota_type q[QTYP_NR]; -} __packed __aligned(8); - /* BCH_SB_FIELD_disk_groups: */ #define BCH_SB_LABEL_SIZE 32 diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h new file mode 100644 index 0000000000000..dc34347ef6c74 --- /dev/null +++ b/fs/bcachefs/quota_format.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_QUOTA_FORMAT_H +#define _BCACHEFS_QUOTA_FORMAT_H + +/* KEY_TYPE_quota: */ + +enum quota_types { + QTYP_USR = 0, + QTYP_GRP = 1, + QTYP_PRJ = 2, + QTYP_NR = 3, +}; + +enum quota_counters { + Q_SPC = 0, + Q_INO = 1, + Q_COUNTERS = 2, +}; + +struct bch_quota_counter { + __le64 hardlimit; + __le64 softlimit; +}; + +struct bch_quota { + struct bch_val v; + struct bch_quota_counter c[Q_COUNTERS]; +} __packed __aligned(8); + +/* BCH_SB_FIELD_quota: */ + +struct bch_sb_quota_counter { + __le32 timelimit; + __le32 warnlimit; +}; + +struct bch_sb_quota_type { + __le64 flags; + struct bch_sb_quota_counter c[Q_COUNTERS]; +}; + +struct bch_sb_field_quota { + struct bch_sb_field field; + struct bch_sb_quota_type q[QTYP_NR]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_QUOTA_FORMAT_H */ -- GitLab From b36425da71fe25a51c7f28af0e92b37e535db4a2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Jan 2024 23:55:39 -0500 Subject: [PATCH 1259/1290] bcachefs: inode_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 165 +-------------------------------- fs/bcachefs/inode_format.h | 166 ++++++++++++++++++++++++++++++++++ 2 files changed, 167 insertions(+), 164 deletions(-) create mode 100644 fs/bcachefs/inode_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 2e91403f82355..691654f265529 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -710,170 +710,6 @@ struct bch_reservation { #define BKEY_BTREE_PTR_U64s_MAX \ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) -/* Inodes */ - -#define BLOCKDEV_INODE_MAX 4096 - -#define BCACHEFS_ROOT_INO 4096 - -struct bch_inode { - struct bch_val v; - - __le64 bi_hash_seed; - __le32 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v2 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v3 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le64 bi_sectors; - __le64 bi_size; - __le64 bi_version; - __u8 fields[]; -} __packed __aligned(8); - -#define INODEv3_FIELDS_START_INITIAL 6 -#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) - -struct bch_inode_generation { - struct bch_val v; - - __le32 bi_generation; - __le32 pad; -} __packed __aligned(8); - -/* - * bi_subvol and bi_parent_subvol are only set for subvolume roots: - */ - -#define BCH_INODE_FIELDS_v2() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_size, 64) \ - x(bi_sectors, 64) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) - -#define BCH_INODE_FIELDS_v3() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) - -/* subset of BCH_INODE_FIELDS */ -#define BCH_INODE_OPTS() \ - x(data_checksum, 8) \ - x(compression, 8) \ - x(project, 32) \ - x(background_compression, 8) \ - x(data_replicas, 8) \ - x(promote_target, 16) \ - x(foreground_target, 16) \ - x(background_target, 16) \ - x(erasure_code, 16) \ - x(nocow, 8) - -enum inode_opt_id { -#define x(name, ...) \ - Inode_opt_##name, - BCH_INODE_OPTS() -#undef x - Inode_opt_nr, -}; - -#define BCH_INODE_FLAGS() \ - x(sync, 0) \ - x(immutable, 1) \ - x(append, 2) \ - x(nodump, 3) \ - x(noatime, 4) \ - x(i_size_dirty, 5) \ - x(i_sectors_dirty, 6) \ - x(unlinked, 7) \ - x(backptr_untrusted, 8) - -/* bits 20+ reserved for packed fields below: */ - -enum bch_inode_flags { -#define x(t, n) BCH_INODE_##t = 1U << n, - BCH_INODE_FLAGS() -#undef x -}; - -enum __bch_inode_flags { -#define x(t, n) __BCH_INODE_##t = n, - BCH_INODE_FLAGS() -#undef x -}; - -LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); -LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); - -LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); -LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); -LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_FIELDS_START, - struct bch_inode_v3, bi_flags, 31, 36); -LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); - /* Dirents */ /* @@ -1209,6 +1045,7 @@ struct bch_sb_field { x(ext, 13) \ x(downgrade, 14) +#include "inode_format.h" #include "quota_format.h" #include "sb-counters_format.h" diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h new file mode 100644 index 0000000000000..83d107331edf4 --- /dev/null +++ b/fs/bcachefs/inode_format.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_FORMAT_H +#define _BCACHEFS_INODE_FORMAT_H + +#define BLOCKDEV_INODE_MAX 4096 +#define BCACHEFS_ROOT_INO 4096 + +struct bch_inode { + struct bch_val v; + + __le64 bi_hash_seed; + __le32 bi_flags; + __le16 bi_mode; + __u8 fields[]; +} __packed __aligned(8); + +struct bch_inode_v2 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le16 bi_mode; + __u8 fields[]; +} __packed __aligned(8); + +struct bch_inode_v3 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le64 bi_sectors; + __le64 bi_size; + __le64 bi_version; + __u8 fields[]; +} __packed __aligned(8); + +#define INODEv3_FIELDS_START_INITIAL 6 +#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) + +struct bch_inode_generation { + struct bch_val v; + + __le32 bi_generation; + __le32 pad; +} __packed __aligned(8); + +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + +#define BCH_INODE_FIELDS_v2() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_size, 64) \ + x(bi_sectors, 64) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) + +#define BCH_INODE_FIELDS_v3() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) \ + x(bi_nocow, 8) + +/* subset of BCH_INODE_FIELDS */ +#define BCH_INODE_OPTS() \ + x(data_checksum, 8) \ + x(compression, 8) \ + x(project, 32) \ + x(background_compression, 8) \ + x(data_replicas, 8) \ + x(promote_target, 16) \ + x(foreground_target, 16) \ + x(background_target, 16) \ + x(erasure_code, 16) \ + x(nocow, 8) + +enum inode_opt_id { +#define x(name, ...) \ + Inode_opt_##name, + BCH_INODE_OPTS() +#undef x + Inode_opt_nr, +}; + +#define BCH_INODE_FLAGS() \ + x(sync, 0) \ + x(immutable, 1) \ + x(append, 2) \ + x(nodump, 3) \ + x(noatime, 4) \ + x(i_size_dirty, 5) \ + x(i_sectors_dirty, 6) \ + x(unlinked, 7) \ + x(backptr_untrusted, 8) + +/* bits 20+ reserved for packed fields below: */ + +enum bch_inode_flags { +#define x(t, n) BCH_INODE_##t = 1U << n, + BCH_INODE_FLAGS() +#undef x +}; + +enum __bch_inode_flags { +#define x(t, n) __BCH_INODE_##t = n, + BCH_INODE_FLAGS() +#undef x +}; + +LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); + +LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); +LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); +LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_FIELDS_START, + struct bch_inode_v3, bi_flags, 31, 36); +LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); + +#endif /* _BCACHEFS_INODE_FORMAT_H */ -- GitLab From 7ffc4daa5f08b13f88c0ce743dadb18040926cbf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Jan 2024 23:57:10 -0500 Subject: [PATCH 1260/1290] bcachefs: dirent_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 40 +-------------------------------- fs/bcachefs/dirent_format.h | 42 +++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 39 deletions(-) create mode 100644 fs/bcachefs/dirent_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 691654f265529..2af3795b49177 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -710,45 +710,6 @@ struct bch_reservation { #define BKEY_BTREE_PTR_U64s_MAX \ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) -/* Dirents */ - -/* - * Dirents (and xattrs) have to implement string lookups; since our b-tree - * doesn't support arbitrary length strings for the key, we instead index by a - * 64 bit hash (currently truncated sha1) of the string, stored in the offset - * field of the key - using linear probing to resolve hash collisions. This also - * provides us with the readdir cookie posix requires. - * - * Linear probing requires us to use whiteouts for deletions, in the event of a - * collision: - */ - -struct bch_dirent { - struct bch_val v; - - /* Target inode number: */ - union { - __le64 d_inum; - struct { /* DT_SUBVOL */ - __le32 d_child_subvol; - __le32 d_parent_subvol; - }; - }; - - /* - * Copy of mode bits 12-15 from the target inode - so userspace can get - * the filetype without having to do a stat() - */ - __u8 d_type; - - __u8 d_name[]; -} __packed __aligned(8); - -#define DT_SUBVOL 16 -#define BCH_DT_MAX 17 - -#define BCH_NAME_MAX 512 - /* Xattrs */ #define KEY_TYPE_XATTR_INDEX_USER 0 @@ -1045,6 +1006,7 @@ struct bch_sb_field { x(ext, 13) \ x(downgrade, 14) +#include "dirent_format.h" #include "inode_format.h" #include "quota_format.h" #include "sb-counters_format.h" diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h new file mode 100644 index 0000000000000..5e116b88e8146 --- /dev/null +++ b/fs/bcachefs/dirent_format.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DIRENT_FORMAT_H +#define _BCACHEFS_DIRENT_FORMAT_H + +/* + * Dirents (and xattrs) have to implement string lookups; since our b-tree + * doesn't support arbitrary length strings for the key, we instead index by a + * 64 bit hash (currently truncated sha1) of the string, stored in the offset + * field of the key - using linear probing to resolve hash collisions. This also + * provides us with the readdir cookie posix requires. + * + * Linear probing requires us to use whiteouts for deletions, in the event of a + * collision: + */ + +struct bch_dirent { + struct bch_val v; + + /* Target inode number: */ + union { + __le64 d_inum; + struct { /* DT_SUBVOL */ + __le32 d_child_subvol; + __le32 d_parent_subvol; + }; + }; + + /* + * Copy of mode bits 12-15 from the target inode - so userspace can get + * the filetype without having to do a stat() + */ + __u8 d_type; + + __u8 d_name[]; +} __packed __aligned(8); + +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + +#define BCH_NAME_MAX 512 + +#endif /* _BCACHEFS_DIRENT_FORMAT_H */ -- GitLab From 72e0801049c9b10fe3cadacf57eb040dbe65ba52 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 20 Jan 2024 23:59:15 -0500 Subject: [PATCH 1261/1290] bcachefs: xattr_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 16 +--------------- fs/bcachefs/xattr_format.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 15 deletions(-) create mode 100644 fs/bcachefs/xattr_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 2af3795b49177..1dbc26a5945ea 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -710,21 +710,6 @@ struct bch_reservation { #define BKEY_BTREE_PTR_U64s_MAX \ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) -/* Xattrs */ - -#define KEY_TYPE_XATTR_INDEX_USER 0 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 -#define KEY_TYPE_XATTR_INDEX_SECURITY 4 - -struct bch_xattr { - struct bch_val v; - __u8 x_type; - __u8 x_name_len; - __le16 x_val_len; - __u8 x_name[]; -} __packed __aligned(8); /* Bucket/allocation information: */ @@ -1008,6 +993,7 @@ struct bch_sb_field { #include "dirent_format.h" #include "inode_format.h" +#include "xattr_format.h" #include "quota_format.h" #include "sb-counters_format.h" diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h new file mode 100644 index 0000000000000..e9f810539552e --- /dev/null +++ b/fs/bcachefs/xattr_format.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_XATTR_FORMAT_H +#define _BCACHEFS_XATTR_FORMAT_H + +#define KEY_TYPE_XATTR_INDEX_USER 0 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 +#define KEY_TYPE_XATTR_INDEX_SECURITY 4 + +struct bch_xattr { + struct bch_val v; + __u8 x_type; + __u8 x_name_len; + __le16 x_val_len; + __u8 x_name[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_XATTR_FORMAT_H */ -- GitLab From d455179fce10f0a7a76b84d1c8327988a93e3216 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 00:01:52 -0500 Subject: [PATCH 1262/1290] bcachefs: alloc_background_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background_format.h | 92 ++++++++++++++++++++++++++ fs/bcachefs/bcachefs_format.h | 95 +-------------------------- 2 files changed, 94 insertions(+), 93 deletions(-) create mode 100644 fs/bcachefs/alloc_background_format.h diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h new file mode 100644 index 0000000000000..b4ec20be93b86 --- /dev/null +++ b/fs/bcachefs/alloc_background_format.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H +#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H + +struct bch_alloc { + struct bch_val v; + __u8 fields; + __u8 gen; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V1() \ + x(read_time, 16) \ + x(write_time, 16) \ + x(data_type, 8) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(oldest_gen, 8) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +enum { +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +#undef x +}; + +struct bch_alloc_v2 { + struct bch_val v; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ + x(dirty_sectors, 32) \ + x(cached_sectors, 32) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +struct bch_alloc_v3 { + struct bch_val v; + __le64 journal_seq; + __le32 flags; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + +struct bch_alloc_v4 { + struct bch_val v; + __u64 journal_seq; + __u32 flags; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 stripe_redundancy; + __u32 dirty_sectors; + __u32 cached_sectors; + __u64 io_time[2]; + __u32 stripe; + __u32 nr_external_backpointers; + __u64 fragmentation_lru; +} __packed __aligned(8); + +#define BCH_ALLOC_V4_U64s_V0 6 +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) + +BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) +BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) +BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) +BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) + +#define KEY_TYPE_BUCKET_GENS_BITS 8 +#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) +#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) + +struct bch_bucket_gens { + struct bch_val v; + u8 gens[KEY_TYPE_BUCKET_GENS_NR]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 1dbc26a5945ea..e26b8000c26bf 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -710,89 +710,6 @@ struct bch_reservation { #define BKEY_BTREE_PTR_U64s_MAX \ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) - -/* Bucket/allocation information: */ - -struct bch_alloc { - struct bch_val v; - __u8 fields; - __u8 gen; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V1() \ - x(read_time, 16) \ - x(write_time, 16) \ - x(data_type, 8) \ - x(dirty_sectors, 16) \ - x(cached_sectors, 16) \ - x(oldest_gen, 8) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -enum { -#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, - BCH_ALLOC_FIELDS_V1() -#undef x -}; - -struct bch_alloc_v2 { - struct bch_val v; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V2() \ - x(read_time, 64) \ - x(write_time, 64) \ - x(dirty_sectors, 32) \ - x(cached_sectors, 32) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -struct bch_alloc_v3 { - struct bch_val v; - __le64 journal_seq; - __le32 flags; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) -LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) - -struct bch_alloc_v4 { - struct bch_val v; - __u64 journal_seq; - __u32 flags; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 stripe_redundancy; - __u32 dirty_sectors; - __u32 cached_sectors; - __u64 io_time[2]; - __u32 stripe; - __u32 nr_external_backpointers; - __u64 fragmentation_lru; -} __packed __aligned(8); - -#define BCH_ALLOC_V4_U64s_V0 6 -#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) - -BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) -BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) -BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) -BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) - -#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 - struct bch_backpointer { struct bch_val v; __u8 btree_id; @@ -803,15 +720,6 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); -#define KEY_TYPE_BUCKET_GENS_BITS 8 -#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) -#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) - -struct bch_bucket_gens { - struct bch_val v; - u8 gens[KEY_TYPE_BUCKET_GENS_NR]; -} __packed __aligned(8); - /* Erasure coding */ struct bch_stripe { @@ -991,8 +899,9 @@ struct bch_sb_field { x(ext, 13) \ x(downgrade, 14) -#include "dirent_format.h" +#include "alloc_background_format.h" #include "inode_format.h" +#include "dirent_format.h" #include "xattr_format.h" #include "quota_format.h" #include "sb-counters_format.h" -- GitLab From 8fed323b14040f42e5755bbb9bd778415634c4b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 02:41:06 -0500 Subject: [PATCH 1263/1290] bcachefs: snapshot_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 34 +-------------------------------- fs/bcachefs/snapshot_format.h | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 33 deletions(-) create mode 100644 fs/bcachefs/snapshot_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index e26b8000c26bf..cfce8ca1f8359 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -805,39 +805,6 @@ LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) -/* Snapshots */ - -struct bch_snapshot { - struct bch_val v; - __le32 flags; - __le32 parent; - __le32 children[2]; - __le32 subvol; - /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ - __le32 tree; - __le32 depth; - __le32 skip[3]; - bch_le128 btime; -}; - -LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) - -/* True if a subvolume points to this snapshot node: */ -LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) - -/* - * Snapshot trees: - * - * The snapshot_trees btree gives us persistent indentifier for each tree of - * bch_snapshot nodes, and allow us to record and easily find the root/master - * subvolume that other snapshots were created from: - */ -struct bch_snapshot_tree { - struct bch_val v; - __le32 master_subvol; - __le32 root_snapshot; -}; - /* LRU btree: */ struct bch_lru { @@ -904,6 +871,7 @@ struct bch_sb_field { #include "dirent_format.h" #include "xattr_format.h" #include "quota_format.h" +#include "snapshot_format.h" #include "sb-counters_format.h" enum bch_sb_field_type { diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h new file mode 100644 index 0000000000000..aabcd3a74cd95 --- /dev/null +++ b/fs/bcachefs/snapshot_format.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H +#define _BCACHEFS_SNAPSHOT_FORMAT_H + +struct bch_snapshot { + struct bch_val v; + __le32 flags; + __le32 parent; + __le32 children[2]; + __le32 subvol; + /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ + __le32 tree; + __le32 depth; + __le32 skip[3]; + bch_le128 btime; +}; + +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) + +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + +/* + * Snapshot trees: + * + * The snapshot_trees btree gives us persistent indentifier for each tree of + * bch_snapshot nodes, and allow us to record and easily find the root/master + * subvolume that other snapshots were created from: + */ +struct bch_snapshot_tree { + struct bch_val v; + __le32 master_subvol; + __le32 root_snapshot; +}; + +#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */ -- GitLab From c6c4ff6507c4e1d32f9c2019795d4b7aa6eb559f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 02:42:53 -0500 Subject: [PATCH 1264/1290] bcachefs: subvolume_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 33 +------------------------------- fs/bcachefs/subvolume_format.h | 35 ++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 32 deletions(-) create mode 100644 fs/bcachefs/subvolume_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index cfce8ca1f8359..6e4fc27ffb3bd 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -773,38 +773,6 @@ struct bch_inline_data { u8 data[]; }; -/* Subvolumes: */ - -#define SUBVOL_POS_MIN POS(0, 1) -#define SUBVOL_POS_MAX POS(0, S32_MAX) -#define BCACHEFS_ROOT_SUBVOL 1 - -struct bch_subvolume { - struct bch_val v; - __le32 flags; - __le32 snapshot; - __le64 inode; - /* - * Snapshot subvolumes form a tree, separate from the snapshot nodes - * tree - if this subvolume is a snapshot, this is the ID of the - * subvolume it was created from: - * - * This is _not_ necessarily the subvolume of the directory containing - * this subvolume: - */ - __le32 parent; - __le32 pad; - bch_le128 otime; -}; - -LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) -/* - * We need to know whether a subvolume is a snapshot so we can know whether we - * can delete it (or whether it should just be rm -rf'd) - */ -LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) -LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) - /* LRU btree: */ struct bch_lru { @@ -872,6 +840,7 @@ struct bch_sb_field { #include "xattr_format.h" #include "quota_format.h" #include "snapshot_format.h" +#include "subvolume_format.h" #include "sb-counters_format.h" enum bch_sb_field_type { diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h new file mode 100644 index 0000000000000..af79134b07d6a --- /dev/null +++ b/fs/bcachefs/subvolume_format.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H +#define _BCACHEFS_SUBVOLUME_FORMAT_H + +#define SUBVOL_POS_MIN POS(0, 1) +#define SUBVOL_POS_MAX POS(0, S32_MAX) +#define BCACHEFS_ROOT_SUBVOL 1 + +struct bch_subvolume { + struct bch_val v; + __le32 flags; + __le32 snapshot; + __le64 inode; + /* + * Snapshot subvolumes form a tree, separate from the snapshot nodes + * tree - if this subvolume is a snapshot, this is the ID of the + * subvolume it was created from: + * + * This is _not_ necessarily the subvolume of the directory containing + * this subvolume: + */ + __le32 parent; + __le32 pad; + bch_le128 otime; +}; + +LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) +/* + * We need to know whether a subvolume is a snapshot so we can know whether we + * can delete it (or whether it should just be rm -rf'd) + */ +LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) +LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) + +#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */ -- GitLab From 0560eb9abf7dee3c3517cb38246522ecaf1efc12 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 02:47:14 -0500 Subject: [PATCH 1265/1290] bcachefs: ec_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 17 +---------------- fs/bcachefs/ec_format.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 16 deletions(-) create mode 100644 fs/bcachefs/ec_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6e4fc27ffb3bd..5327514d96f9f 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -720,22 +720,6 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); -/* Erasure coding */ - -struct bch_stripe { - struct bch_val v; - __le16 sectors; - __u8 algorithm; - __u8 nr_blocks; - __u8 nr_redundant; - - __u8 csum_granularity_bits; - __u8 csum_type; - __u8 pad; - - struct bch_extent_ptr ptrs[]; -} __packed __aligned(8); - /* Reflink: */ struct bch_reflink_p { @@ -835,6 +819,7 @@ struct bch_sb_field { x(downgrade, 14) #include "alloc_background_format.h" +#include "ec_format.h" #include "inode_format.h" #include "dirent_format.h" #include "xattr_format.h" diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h new file mode 100644 index 0000000000000..44ce88ba08d71 --- /dev/null +++ b/fs/bcachefs/ec_format.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_FORMAT_H +#define _BCACHEFS_EC_FORMAT_H + +struct bch_stripe { + struct bch_val v; + __le16 sectors; + __u8 algorithm; + __u8 nr_blocks; + __u8 nr_redundant; + + __u8 csum_granularity_bits; + __u8 csum_type; + __u8 pad; + + struct bch_extent_ptr ptrs[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_EC_FORMAT_H */ -- GitLab From b2fa1b633bac0c3b2d04ae00e8801414d251aace Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 02:51:56 -0500 Subject: [PATCH 1266/1290] bcachefs; extents_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 281 +-------------------------------- fs/bcachefs/extents_format.h | 282 ++++++++++++++++++++++++++++++++++ 2 files changed, 284 insertions(+), 279 deletions(-) create mode 100644 fs/bcachefs/extents_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5327514d96f9f..2921ecd49c6ee 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -417,272 +417,12 @@ struct bch_set { struct bch_val v; }; -/* Extents */ - -/* - * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally - * preceded by checksum/compression information (bch_extent_crc32 or - * bch_extent_crc64). - * - * One major determining factor in the format of extents is how we handle and - * represent extents that have been partially overwritten and thus trimmed: - * - * If an extent is not checksummed or compressed, when the extent is trimmed we - * don't have to remember the extent we originally allocated and wrote: we can - * merely adjust ptr->offset to point to the start of the data that is currently - * live. The size field in struct bkey records the current (live) size of the - * extent, and is also used to mean "size of region on disk that we point to" in - * this case. - * - * Thus an extent that is not checksummed or compressed will consist only of a - * list of bch_extent_ptrs, with none of the fields in - * bch_extent_crc32/bch_extent_crc64. - * - * When an extent is checksummed or compressed, it's not possible to read only - * the data that is currently live: we have to read the entire extent that was - * originally written, and then return only the part of the extent that is - * currently live. - * - * Thus, in addition to the current size of the extent in struct bkey, we need - * to store the size of the originally allocated space - this is the - * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, - * when the extent is trimmed, instead of modifying the offset field of the - * pointer, we keep a second smaller offset field - "offset into the original - * extent of the currently live region". - * - * The other major determining factor is replication and data migration: - * - * Each pointer may have its own bch_extent_crc32/64. When doing a replicated - * write, we will initially write all the replicas in the same format, with the - * same checksum type and compression format - however, when copygc runs later (or - * tiering/cache promotion, anything that moves data), it is not in general - * going to rewrite all the pointers at once - one of the replicas may be in a - * bucket on one device that has very little fragmentation while another lives - * in a bucket that has become heavily fragmented, and thus is being rewritten - * sooner than the rest. - * - * Thus it will only move a subset of the pointers (or in the case of - * tiering/cache promotion perhaps add a single pointer without dropping any - * current pointers), and if the extent has been partially overwritten it must - * write only the currently live portion (or copygc would not be able to reduce - * fragmentation!) - which necessitates a different bch_extent_crc format for - * the new pointer. - * - * But in the interests of space efficiency, we don't want to store one - * bch_extent_crc for each pointer if we don't have to. - * - * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and - * bch_extent_ptrs appended arbitrarily one after the other. We determine the - * type of a given entry with a scheme similar to utf8 (except we're encoding a - * type, not a size), encoding the type in the position of the first set bit: - * - * bch_extent_crc32 - 0b1 - * bch_extent_ptr - 0b10 - * bch_extent_crc64 - 0b100 - * - * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and - * bch_extent_crc64 is the least constrained). - * - * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, - * until the next bch_extent_crc32/64. - * - * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer - * is neither checksummed nor compressed. - */ - /* 128 bits, sufficient for cryptographic MACs: */ struct bch_csum { __le64 lo; __le64 hi; } __packed __aligned(8); -#define BCH_EXTENT_ENTRY_TYPES() \ - x(ptr, 0) \ - x(crc32, 1) \ - x(crc64, 2) \ - x(crc128, 3) \ - x(stripe_ptr, 4) \ - x(rebalance, 5) -#define BCH_EXTENT_ENTRY_MAX 6 - -enum bch_extent_entry_type { -#define x(f, n) BCH_EXTENT_ENTRY_##f = n, - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -/* Compressed/uncompressed size are stored biased by 1: */ -struct bch_extent_crc32 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u32 type:2, - _compressed_size:7, - _uncompressed_size:7, - offset:7, - _unused:1, - csum_type:4, - compression_type:4; - __u32 csum; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u32 csum; - __u32 compression_type:4, - csum_type:4, - _unused:1, - offset:7, - _uncompressed_size:7, - _compressed_size:7, - type:2; -#endif -} __packed __aligned(8); - -#define CRC32_SIZE_MAX (1U << 7) -#define CRC32_NONCE_MAX 0 - -struct bch_extent_crc64 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:3, - _compressed_size:9, - _uncompressed_size:9, - offset:9, - nonce:10, - csum_type:4, - compression_type:4, - csum_hi:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 csum_hi:16, - compression_type:4, - csum_type:4, - nonce:10, - offset:9, - _uncompressed_size:9, - _compressed_size:9, - type:3; -#endif - __u64 csum_lo; -} __packed __aligned(8); - -#define CRC64_SIZE_MAX (1U << 9) -#define CRC64_NONCE_MAX ((1U << 10) - 1) - -struct bch_extent_crc128 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:4, - _compressed_size:13, - _uncompressed_size:13, - offset:13, - nonce:13, - csum_type:4, - compression_type:4; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 compression_type:4, - csum_type:4, - nonce:13, - offset:13, - _uncompressed_size:13, - _compressed_size:13, - type:4; -#endif - struct bch_csum csum; -} __packed __aligned(8); - -#define CRC128_SIZE_MAX (1U << 13) -#define CRC128_NONCE_MAX ((1U << 13) - 1) - -/* - * @reservation - pointer hasn't been written to, just reserved - */ -struct bch_extent_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:1, - cached:1, - unused:1, - unwritten:1, - offset:44, /* 8 petabytes */ - dev:8, - gen:8; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 gen:8, - dev:8, - offset:44, - unwritten:1, - unused:1, - cached:1, - type:1; -#endif -} __packed __aligned(8); - -struct bch_extent_stripe_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:5, - block:8, - redundancy:4, - idx:47; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 idx:47, - redundancy:4, - block:8, - type:5; -#endif -}; - -struct bch_extent_rebalance { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:34, - compression:8, /* enum bch_compression_opt */ - target:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 target:16, - compression:8, - unused:34, - type:6; -#endif -}; - -union bch_extent_entry { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 - unsigned long type; -#elif __BITS_PER_LONG == 32 - struct { - unsigned long pad; - unsigned long type; - }; -#else -#error edit for your odd byteorder. -#endif - -#define x(f, n) struct bch_extent_##f f; - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -struct bch_btree_ptr { - struct bch_val v; - - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -struct bch_btree_ptr_v2 { - struct bch_val v; - - __u64 mem_ptr; - __le64 seq; - __le16 sectors_written; - __le16 flags; - struct bpos min_key; - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); - -struct bch_extent { - struct bch_val v; - - __u64 _data[0]; - union bch_extent_entry start[]; -} __packed __aligned(8); - struct bch_reservation { struct bch_val v; @@ -691,25 +431,6 @@ struct bch_reservation { __u8 pad[3]; } __packed __aligned(8); -/* Maximum size (in u64s) a single pointer could be: */ -#define BKEY_EXTENT_PTR_U64s_MAX\ - ((sizeof(struct bch_extent_crc128) + \ - sizeof(struct bch_extent_ptr)) / sizeof(__u64)) - -/* Maximum possible size of an entire extent value: */ -#define BKEY_EXTENT_VAL_U64s_MAX \ - (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) - -/* * Maximum possible size of an entire extent, key + value: */ -#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) - -/* Btree pointers don't carry around checksums: */ -#define BKEY_BTREE_PTR_VAL_U64s_MAX \ - ((sizeof(struct bch_btree_ptr_v2) + \ - sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) -#define BKEY_BTREE_PTR_U64s_MAX \ - (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) - struct bch_backpointer { struct bch_val v; __u8 btree_id; @@ -720,6 +441,8 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); +#include "extents_format.h" + /* Reflink: */ struct bch_reflink_p { diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h new file mode 100644 index 0000000000000..939d09f9d3b84 --- /dev/null +++ b/fs/bcachefs/extents_format.h @@ -0,0 +1,282 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_FORMAT_H +#define _BCACHEFS_EXTENTS_FORMAT_H + +/* + * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally + * preceded by checksum/compression information (bch_extent_crc32 or + * bch_extent_crc64). + * + * One major determining factor in the format of extents is how we handle and + * represent extents that have been partially overwritten and thus trimmed: + * + * If an extent is not checksummed or compressed, when the extent is trimmed we + * don't have to remember the extent we originally allocated and wrote: we can + * merely adjust ptr->offset to point to the start of the data that is currently + * live. The size field in struct bkey records the current (live) size of the + * extent, and is also used to mean "size of region on disk that we point to" in + * this case. + * + * Thus an extent that is not checksummed or compressed will consist only of a + * list of bch_extent_ptrs, with none of the fields in + * bch_extent_crc32/bch_extent_crc64. + * + * When an extent is checksummed or compressed, it's not possible to read only + * the data that is currently live: we have to read the entire extent that was + * originally written, and then return only the part of the extent that is + * currently live. + * + * Thus, in addition to the current size of the extent in struct bkey, we need + * to store the size of the originally allocated space - this is the + * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, + * when the extent is trimmed, instead of modifying the offset field of the + * pointer, we keep a second smaller offset field - "offset into the original + * extent of the currently live region". + * + * The other major determining factor is replication and data migration: + * + * Each pointer may have its own bch_extent_crc32/64. When doing a replicated + * write, we will initially write all the replicas in the same format, with the + * same checksum type and compression format - however, when copygc runs later (or + * tiering/cache promotion, anything that moves data), it is not in general + * going to rewrite all the pointers at once - one of the replicas may be in a + * bucket on one device that has very little fragmentation while another lives + * in a bucket that has become heavily fragmented, and thus is being rewritten + * sooner than the rest. + * + * Thus it will only move a subset of the pointers (or in the case of + * tiering/cache promotion perhaps add a single pointer without dropping any + * current pointers), and if the extent has been partially overwritten it must + * write only the currently live portion (or copygc would not be able to reduce + * fragmentation!) - which necessitates a different bch_extent_crc format for + * the new pointer. + * + * But in the interests of space efficiency, we don't want to store one + * bch_extent_crc for each pointer if we don't have to. + * + * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and + * bch_extent_ptrs appended arbitrarily one after the other. We determine the + * type of a given entry with a scheme similar to utf8 (except we're encoding a + * type, not a size), encoding the type in the position of the first set bit: + * + * bch_extent_crc32 - 0b1 + * bch_extent_ptr - 0b10 + * bch_extent_crc64 - 0b100 + * + * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and + * bch_extent_crc64 is the least constrained). + * + * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, + * until the next bch_extent_crc32/64. + * + * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer + * is neither checksummed nor compressed. + */ + +#define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ + x(crc32, 1) \ + x(crc64, 2) \ + x(crc128, 3) \ + x(stripe_ptr, 4) \ + x(rebalance, 5) +#define BCH_EXTENT_ENTRY_MAX 6 + +enum bch_extent_entry_type { +#define x(f, n) BCH_EXTENT_ENTRY_##f = n, + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +/* Compressed/uncompressed size are stored biased by 1: */ +struct bch_extent_crc32 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 type:2, + _compressed_size:7, + _uncompressed_size:7, + offset:7, + _unused:1, + csum_type:4, + compression_type:4; + __u32 csum; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u32 csum; + __u32 compression_type:4, + csum_type:4, + _unused:1, + offset:7, + _uncompressed_size:7, + _compressed_size:7, + type:2; +#endif +} __packed __aligned(8); + +#define CRC32_SIZE_MAX (1U << 7) +#define CRC32_NONCE_MAX 0 + +struct bch_extent_crc64 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:3, + _compressed_size:9, + _uncompressed_size:9, + offset:9, + nonce:10, + csum_type:4, + compression_type:4, + csum_hi:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 csum_hi:16, + compression_type:4, + csum_type:4, + nonce:10, + offset:9, + _uncompressed_size:9, + _compressed_size:9, + type:3; +#endif + __u64 csum_lo; +} __packed __aligned(8); + +#define CRC64_SIZE_MAX (1U << 9) +#define CRC64_NONCE_MAX ((1U << 10) - 1) + +struct bch_extent_crc128 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:4, + _compressed_size:13, + _uncompressed_size:13, + offset:13, + nonce:13, + csum_type:4, + compression_type:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 compression_type:4, + csum_type:4, + nonce:13, + offset:13, + _uncompressed_size:13, + _compressed_size:13, + type:4; +#endif + struct bch_csum csum; +} __packed __aligned(8); + +#define CRC128_SIZE_MAX (1U << 13) +#define CRC128_NONCE_MAX ((1U << 13) - 1) + +/* + * @reservation - pointer hasn't been written to, just reserved + */ +struct bch_extent_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:1, + cached:1, + unused:1, + unwritten:1, + offset:44, /* 8 petabytes */ + dev:8, + gen:8; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 gen:8, + dev:8, + offset:44, + unwritten:1, + unused:1, + cached:1, + type:1; +#endif +} __packed __aligned(8); + +struct bch_extent_stripe_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:5, + block:8, + redundancy:4, + idx:47; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 idx:47, + redundancy:4, + block:8, + type:5; +#endif +}; + +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:34, + compression:8, /* enum bch_compression_opt */ + target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 target:16, + compression:8, + unused:34, + type:6; +#endif +}; + +union bch_extent_entry { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 + unsigned long type; +#elif __BITS_PER_LONG == 32 + struct { + unsigned long pad; + unsigned long type; + }; +#else +#error edit for your odd byteorder. +#endif + +#define x(f, n) struct bch_extent_##f f; + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +struct bch_btree_ptr { + struct bch_val v; + + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +struct bch_btree_ptr_v2 { + struct bch_val v; + + __u64 mem_ptr; + __le64 seq; + __le16 sectors_written; + __le16 flags; + struct bpos min_key; + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); + +struct bch_extent { + struct bch_val v; + + __u64 _data[0]; + union bch_extent_entry start[]; +} __packed __aligned(8); + +/* Maximum size (in u64s) a single pointer could be: */ +#define BKEY_EXTENT_PTR_U64s_MAX\ + ((sizeof(struct bch_extent_crc128) + \ + sizeof(struct bch_extent_ptr)) / sizeof(__u64)) + +/* Maximum possible size of an entire extent value: */ +#define BKEY_EXTENT_VAL_U64s_MAX \ + (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) + +/* * Maximum possible size of an entire extent, key + value: */ +#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) + +/* Btree pointers don't carry around checksums: */ +#define BKEY_BTREE_PTR_VAL_U64s_MAX \ + ((sizeof(struct bch_btree_ptr_v2) + \ + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) +#define BKEY_BTREE_PTR_U64s_MAX \ + (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) + +#endif /* _BCACHEFS_EXTENTS_FORMAT_H */ -- GitLab From 8d52ba60c4dccbf5d45db70f41b82b18c38059bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 02:54:47 -0500 Subject: [PATCH 1267/1290] bcachefs: reflink_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 49 ++--------------------------------- fs/bcachefs/extents_format.h | 13 ++++++++++ fs/bcachefs/reflink_format.h | 33 +++++++++++++++++++++++ 3 files changed, 48 insertions(+), 47 deletions(-) create mode 100644 fs/bcachefs/reflink_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 2921ecd49c6ee..12b0ddedebd7a 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -423,14 +423,6 @@ struct bch_csum { __le64 hi; } __packed __aligned(8); -struct bch_reservation { - struct bch_val v; - - __le32 generation; - __u8 nr_replicas; - __u8 pad[3]; -} __packed __aligned(8); - struct bch_backpointer { struct bch_val v; __u8 btree_id; @@ -441,45 +433,6 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); -#include "extents_format.h" - -/* Reflink: */ - -struct bch_reflink_p { - struct bch_val v; - __le64 idx; - /* - * A reflink pointer might point to an indirect extent which is then - * later split (by copygc or rebalance). If we only pointed to part of - * the original indirect extent, and then one of the fragments is - * outside the range we point to, we'd leak a refcount: so when creating - * reflink pointers, we need to store pad values to remember the full - * range we were taking a reference on. - */ - __le32 front_pad; - __le32 back_pad; -} __packed __aligned(8); - -struct bch_reflink_v { - struct bch_val v; - __le64 refcount; - union bch_extent_entry start[0]; - __u64 _data[]; -} __packed __aligned(8); - -struct bch_indirect_inline_data { - struct bch_val v; - __le64 refcount; - u8 data[]; -}; - -/* Inline data */ - -struct bch_inline_data { - struct bch_val v; - u8 data[]; -}; - /* LRU btree: */ struct bch_lru { @@ -542,6 +495,8 @@ struct bch_sb_field { x(downgrade, 14) #include "alloc_background_format.h" +#include "extents_format.h" +#include "reflink_format.h" #include "ec_format.h" #include "inode_format.h" #include "dirent_format.h" diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index 939d09f9d3b84..3bd2fdbb08174 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -279,4 +279,17 @@ struct bch_extent { #define BKEY_BTREE_PTR_U64s_MAX \ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) +struct bch_reservation { + struct bch_val v; + + __le32 generation; + __u8 nr_replicas; + __u8 pad[3]; +} __packed __aligned(8); + +struct bch_inline_data { + struct bch_val v; + u8 data[]; +}; + #endif /* _BCACHEFS_EXTENTS_FORMAT_H */ diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h new file mode 100644 index 0000000000000..6772eebb1fc66 --- /dev/null +++ b/fs/bcachefs/reflink_format.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_FORMAT_H +#define _BCACHEFS_REFLINK_FORMAT_H + +struct bch_reflink_p { + struct bch_val v; + __le64 idx; + /* + * A reflink pointer might point to an indirect extent which is then + * later split (by copygc or rebalance). If we only pointed to part of + * the original indirect extent, and then one of the fragments is + * outside the range we point to, we'd leak a refcount: so when creating + * reflink pointers, we need to store pad values to remember the full + * range we were taking a reference on. + */ + __le32 front_pad; + __le32 back_pad; +} __packed __aligned(8); + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[]; +} __packed __aligned(8); + +struct bch_indirect_inline_data { + struct bch_val v; + __le64 refcount; + u8 data[]; +}; + +#endif /* _BCACHEFS_REFLINK_FORMAT_H */ -- GitLab From d826cc57c53fa759cac019efc9e59e475cf41070 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 02:57:45 -0500 Subject: [PATCH 1268/1290] bcachefs: logged_ops_format.h Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 28 +--------------------------- fs/bcachefs/logged_ops_format.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 27 deletions(-) create mode 100644 fs/bcachefs/logged_ops_format.h diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 12b0ddedebd7a..0668b682a21ca 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -442,33 +442,6 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) -/* Logged operations btree: */ - -struct bch_logged_op_truncate { - struct bch_val v; - __le32 subvol; - __le32 pad; - __le64 inum; - __le64 new_i_size; -}; - -enum logged_op_finsert_state { - LOGGED_OP_FINSERT_start, - LOGGED_OP_FINSERT_shift_extents, - LOGGED_OP_FINSERT_finish, -}; - -struct bch_logged_op_finsert { - struct bch_val v; - __u8 state; - __u8 pad[3]; - __le32 subvol; - __le64 inum; - __le64 dst_offset; - __le64 src_offset; - __le64 pos; -}; - /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -502,6 +475,7 @@ struct bch_sb_field { #include "dirent_format.h" #include "xattr_format.h" #include "quota_format.h" +#include "logged_ops_format.h" #include "snapshot_format.h" #include "subvolume_format.h" #include "sb-counters_format.h" diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h new file mode 100644 index 0000000000000..6a4bf7129dba2 --- /dev/null +++ b/fs/bcachefs/logged_ops_format.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H +#define _BCACHEFS_LOGGED_OPS_FORMAT_H + +struct bch_logged_op_truncate { + struct bch_val v; + __le32 subvol; + __le32 pad; + __le64 inum; + __le64 new_i_size; +}; + +enum logged_op_finsert_state { + LOGGED_OP_FINSERT_start, + LOGGED_OP_FINSERT_shift_extents, + LOGGED_OP_FINSERT_finish, +}; + +struct bch_logged_op_finsert { + struct bch_val v; + __u8 state; + __u8 pad[3]; + __le32 subvol; + __le64 inum; + __le64 dst_offset; + __le64 src_offset; + __le64 pos; +}; + +#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */ -- GitLab From 249f441f83c546281f1c175756c81fac332bb64c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 21 Jan 2024 12:19:01 -0500 Subject: [PATCH 1269/1290] bcachefs: Improve inode_to_text() Add line breaks - inode_to_text() is now much easier to read. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 18a8d141b4437..086f0090b03a4 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -506,22 +506,33 @@ fsck_err: static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - prt_printf(out, "mode=%o ", inode->bi_mode); + printbuf_indent_add(out, 2); + prt_printf(out, "mode=%o", inode->bi_mode); + prt_newline(out); prt_str(out, "flags="); prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); prt_printf(out, " (%x)", inode->bi_flags); + prt_newline(out); - prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu", - inode->bi_journal_seq, - inode->bi_size, - inode->bi_sectors, - inode->bi_version); + prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq); + prt_newline(out); + + prt_printf(out, "bi_size=%llu", inode->bi_size); + prt_newline(out); + + prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); + prt_newline(out); + + prt_newline(out); + prt_printf(out, "bi_version=%llu", inode->bi_version); #define x(_name, _bits) \ - prt_printf(out, " "#_name "=%llu", (u64) inode->_name); + prt_printf(out, #_name "=%llu", (u64) inode->_name); \ + prt_newline(out); BCH_INODE_FIELDS_v3() #undef x + printbuf_indent_sub(out, 2); } void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) -- GitLab From 6613476e225e090cc9aad49be7fa504e290dd33d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 21 Jan 2024 14:11:32 -0800 Subject: [PATCH 1270/1290] Linux 6.8-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index e5321e45e4e52..9869f57c3fb3e 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 -PATCHLEVEL = 7 +PATCHLEVEL = 8 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- GitLab From cd30e8bde28ac361e15d67ee5c00e0125ed42548 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 15 Jan 2024 21:37:47 +0100 Subject: [PATCH 1271/1290] rbd: remove usage of the deprecated ida_simple_*() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). Note that the upper limit of ida_simple_get() is exclusive, while that of ida_alloc_max() is inclusive, so 1 has been subtracted. [ idryomov: tweak changelog ] Signed-off-by: Christophe JAILLET Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a999b698b131f..63897d0d66292 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5326,7 +5326,7 @@ static void rbd_dev_release(struct device *dev) if (need_put) { destroy_workqueue(rbd_dev->task_wq); - ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); + ida_free(&rbd_dev_id_ida, rbd_dev->dev_id); } rbd_dev_free(rbd_dev); @@ -5402,9 +5402,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, return NULL; /* get an id and fill in device name */ - rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, - minor_to_rbd_dev_id(1 << MINORBITS), - GFP_KERNEL); + rbd_dev->dev_id = ida_alloc_max(&rbd_dev_id_ida, + minor_to_rbd_dev_id(1 << MINORBITS) - 1, + GFP_KERNEL); if (rbd_dev->dev_id < 0) goto fail_rbd_dev; @@ -5425,7 +5425,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, return rbd_dev; fail_dev_id: - ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); + ida_free(&rbd_dev_id_ida, rbd_dev->dev_id); fail_rbd_dev: rbd_dev_free(rbd_dev); return NULL; -- GitLab From ded080c86b3f99683774af0441a58fc2e3d60cae Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 17 Jan 2024 18:59:44 +0100 Subject: [PATCH 1272/1290] rbd: don't move requests to the running list on errors The running list is supposed to contain requests that are pinning the exclusive lock, i.e. those that must be flushed before exclusive lock is released. When wake_lock_waiters() is called to handle an error, requests on the acquiring list are failed with that error and no flushing takes place. Briefly moving them to the running list is not only pointless but also harmful: if exclusive lock gets acquired before all of their state machines are scheduled and go through rbd_lock_del_request(), we trigger rbd_assert(list_empty(&rbd_dev->running_list)); in rbd_try_acquire_lock(). Cc: stable@vger.kernel.org Fixes: 637cd060537d ("rbd: new exclusive lock wait/wake code") Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang --- drivers/block/rbd.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 63897d0d66292..12b5d53ec8564 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3452,14 +3452,15 @@ static bool rbd_lock_add_request(struct rbd_img_request *img_req) static void rbd_lock_del_request(struct rbd_img_request *img_req) { struct rbd_device *rbd_dev = img_req->rbd_dev; - bool need_wakeup; + bool need_wakeup = false; lockdep_assert_held(&rbd_dev->lock_rwsem); spin_lock(&rbd_dev->lock_lists_lock); - rbd_assert(!list_empty(&img_req->lock_item)); - list_del_init(&img_req->lock_item); - need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING && - list_empty(&rbd_dev->running_list)); + if (!list_empty(&img_req->lock_item)) { + list_del_init(&img_req->lock_item); + need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING && + list_empty(&rbd_dev->running_list)); + } spin_unlock(&rbd_dev->lock_lists_lock); if (need_wakeup) complete(&rbd_dev->releasing_wait); @@ -3842,14 +3843,19 @@ static void wake_lock_waiters(struct rbd_device *rbd_dev, int result) return; } - list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) { + while (!list_empty(&rbd_dev->acquiring_list)) { + img_req = list_first_entry(&rbd_dev->acquiring_list, + struct rbd_img_request, lock_item); mutex_lock(&img_req->state_mutex); rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK); + if (!result) + list_move_tail(&img_req->lock_item, + &rbd_dev->running_list); + else + list_del_init(&img_req->lock_item); rbd_img_schedule(img_req, result); mutex_unlock(&img_req->state_mutex); } - - list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list); } static bool locker_equal(const struct ceph_locker *lhs, -- GitLab From 113a61863ecbfb3c29f3eb18fd9813bccf1743c1 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 31 Oct 2023 17:50:11 -0600 Subject: [PATCH 1273/1290] Makefile: Enable -Wstringop-overflow globally It seems that we have finished addressing all the remaining issues regarding -Wstringop-overflow. So, we are now in good shape to enable this compiler option globally. Signed-off-by: Gustavo A. R. Silva --- Makefile | 2 ++ scripts/Makefile.extrawarn | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 9869f57c3fb3e..c01cbbb208cab 100644 --- a/Makefile +++ b/Makefile @@ -986,6 +986,8 @@ NOSTDINC_FLAGS += -nostdinc # perform bounds checking. KBUILD_CFLAGS += $(call cc-option, -fstrict-flex-arrays=3) +KBUILD_CFLAGS += $(call cc-option, -Wstringop-overflow) + # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += -fno-strict-overflow diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index 9b7a37ae28a88..a9e552a1e9105 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -97,7 +97,6 @@ KBUILD_CFLAGS += $(call cc-option, -Wunused-const-variable) KBUILD_CFLAGS += $(call cc-option, -Wpacked-not-aligned) KBUILD_CFLAGS += $(call cc-option, -Wformat-overflow) KBUILD_CFLAGS += $(call cc-option, -Wformat-truncation) -KBUILD_CFLAGS += $(call cc-option, -Wstringop-overflow) KBUILD_CFLAGS += $(call cc-option, -Wstringop-truncation) KBUILD_CPPFLAGS += -Wundef @@ -113,7 +112,6 @@ KBUILD_CFLAGS += $(call cc-disable-warning, restrict) KBUILD_CFLAGS += $(call cc-disable-warning, packed-not-aligned) KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow) KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation) -KBUILD_CFLAGS += $(call cc-disable-warning, stringop-overflow) KBUILD_CFLAGS += $(call cc-disable-warning, stringop-truncation) ifdef CONFIG_CC_IS_CLANG -- GitLab From a5e0ace04fbf56c1794b1a2fa7a93672753b3fc7 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 30 Nov 2023 14:29:34 -0600 Subject: [PATCH 1274/1290] init: Kconfig: Disable -Wstringop-overflow for GCC-11 -Wstringop-overflow is buggy in GCC-11. Therefore, we should disable this option specifically for that compiler version. To achieve this, we introduce a new configuration option: GCC11_NO_STRINGOP_OVERFLOW. The compiler option related to string operation overflow is now managed under configuration CC_STRINGOP_OVERFLOW. This option is enabled by default for all other versions of GCC that support it. Link: https://lore.kernel.org/lkml/b3c99290-40bc-426f-b3d2-1aa903f95c4e@embeddedor.com/ Link: https://lore.kernel.org/lkml/20231128091351.2bfb38dd@canb.auug.org.au/ Reviewed-by: Kees Cook Link: https://lore.kernel.org/linux-hardening/ZWj1+jkweEDWbmAR@work/ Signed-off-by: Gustavo A. R. Silva --- Makefile | 4 +++- init/Kconfig | 12 ++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c01cbbb208cab..9f9b76d3a4b7d 100644 --- a/Makefile +++ b/Makefile @@ -986,7 +986,9 @@ NOSTDINC_FLAGS += -nostdinc # perform bounds checking. KBUILD_CFLAGS += $(call cc-option, -fstrict-flex-arrays=3) -KBUILD_CFLAGS += $(call cc-option, -Wstringop-overflow) +#Currently, disable -Wstringop-overflow for GCC 11, globally. +KBUILD_CFLAGS-$(CONFIG_CC_NO_STRINGOP_OVERFLOW) += $(call cc-option, -Wno-stringop-overflow) +KBUILD_CFLAGS-$(CONFIG_CC_STRINGOP_OVERFLOW) += $(call cc-option, -Wstringop-overflow) # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += -fno-strict-overflow diff --git a/init/Kconfig b/init/Kconfig index 8df18f3a97484..8d4e836e1b6b1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -876,6 +876,18 @@ config CC_NO_ARRAY_BOUNDS bool default y if CC_IS_GCC && GCC_VERSION >= 110000 && GCC11_NO_ARRAY_BOUNDS +# Currently, disable -Wstringop-overflow for GCC 11, globally. +config GCC11_NO_STRINGOP_OVERFLOW + def_bool y + +config CC_NO_STRINGOP_OVERFLOW + bool + default y if CC_IS_GCC && GCC_VERSION >= 110000 && GCC_VERSION < 120000 && GCC11_NO_STRINGOP_OVERFLOW + +config CC_STRINGOP_OVERFLOW + bool + default y if CC_IS_GCC && !CC_NO_STRINGOP_OVERFLOW + # # For architectures that know their GCC __int128 support is sound # -- GitLab From 27daa514c48d5796d564ea5410cb72f78a521891 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 6 Dec 2023 12:21:42 +0300 Subject: [PATCH 1275/1290] ELF, MAINTAINERS: specifically mention ELF People complain when I miss people in Cc. [ kees: Also add the ELF uapi doc link ] Signed-off-by: Alexey Dobriyan Link: https://lore.kernel.org/r/2cb0891e-d7c0-4939-bb5f-282812de6078@p183 Signed-off-by: Kees Cook --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a692..39219b144c239 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7955,12 +7955,13 @@ L: rust-for-linux@vger.kernel.org S: Maintained F: rust/kernel/net/phy.rs -EXEC & BINFMT API +EXEC & BINFMT API, ELF R: Eric Biederman R: Kees Cook L: linux-mm@kvack.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/execve +F: Documentation/userspace-api/ELF.rst F: fs/*binfmt_*.c F: fs/exec.c F: include/linux/binfmts.h -- GitLab From 8788a17c2319f020ccdc3f2907179a5ae81b7ad6 Mon Sep 17 00:00:00 2001 From: Askar Safin Date: Tue, 9 Jan 2024 06:04:34 +0300 Subject: [PATCH 1276/1290] exec: remove useless comment Function name is wrong and the comment tells us nothing Signed-off-by: Askar Safin Link: https://lore.kernel.org/r/20240109030801.31827-1-safinaskar@zohomail.com Signed-off-by: Kees Cook --- fs/exec.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 8cdd5b2dd09c2..ba7d0548ac57f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1826,9 +1826,6 @@ static int exec_binprm(struct linux_binprm *bprm) return 0; } -/* - * sys_execve() executes a new program. - */ static int bprm_execve(struct linux_binprm *bprm) { int retval; -- GitLab From bdd8f62431ebcf15902a5fce3336388e436405c6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 16 Sep 2022 17:11:18 -0700 Subject: [PATCH 1277/1290] exec: Add do_close_execat() helper Consolidate the calls to allow_write_access()/fput() into a single place, since we repeat this code pattern. Add comments around the callers for the details on it. Link: https://lore.kernel.org/r/202209161637.9EDAF6B18@keescook Signed-off-by: Kees Cook --- fs/exec.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index ba7d0548ac57f..2037cc6360364 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -904,6 +904,10 @@ EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ +/* + * On success, caller must call do_close_execat() on the returned + * struct file to close it. + */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; @@ -948,6 +952,17 @@ exit: return ERR_PTR(err); } +/** + * open_exec - Open a path name for execution + * + * @name: path name to open with the intent of executing it. + * + * Returns ERR_PTR on failure or allocated struct file on success. + * + * As this is a wrapper for the internal do_open_execat(), callers + * must call allow_write_access() before fput() on release. Also see + * do_close_execat(). + */ struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); @@ -1484,6 +1499,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) return -ENOMEM; } +/* Matches do_open_execat() */ +static void do_close_execat(struct file *file) +{ + if (!file) + return; + allow_write_access(file); + fput(file); +} + static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { @@ -1495,10 +1519,7 @@ static void free_bprm(struct linux_binprm *bprm) mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } + do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ @@ -1520,8 +1541,7 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) { - allow_write_access(file); - fput(file); + do_close_execat(file); return ERR_PTR(-ENOMEM); } -- GitLab From 84c39ec57d409e803a9bb6e4e85daf1243e0e80b Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Mon, 22 Jan 2024 19:34:21 +0100 Subject: [PATCH 1278/1290] exec: Fix error handling in begin_new_exec() If get_unused_fd_flags() fails, the error handling is incomplete because bprm->cred is already set to NULL, and therefore free_bprm will not unlock the cred_guard_mutex. Note there are two error conditions which end up here, one before and one after bprm->cred is cleared. Fixes: b8a61c9e7b4a ("exec: Generic execfd support") Signed-off-by: Bernd Edlinger Acked-by: Eric W. Biederman Link: https://lore.kernel.org/r/AS8P193MB128517ADB5EFF29E04389EDAE4752@AS8P193MB1285.EURP193.PROD.OUTLOOK.COM Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- fs/exec.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/exec.c b/fs/exec.c index 2037cc6360364..39d773021fffd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1424,6 +1424,9 @@ int begin_new_exec(struct linux_binprm * bprm) out_unlock: up_write(&me->signal->exec_update_lock); + if (!bprm->cred) + mutex_unlock(&me->signal->cred_guard_mutex); + out: return retval; } -- GitLab From 018856c3f171517c66d5d7d3755ae0c517924fd7 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 22 Jan 2024 16:04:58 +0100 Subject: [PATCH 1279/1290] fbcon: Fix incorrect printed function name in fbcon_prepare_logo() If the boot logo does not fit, a message is printed, including a wrong function name prefix. Instead of correcting the function name (or using __func__), just use "fbcon", like is done in several other messages. While at it, modernize the call by switching to pr_info(). Signed-off-by: Geert Uytterhoeven Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fbcon.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 63af6ab034b5f..1183e7a871f8b 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -631,8 +631,7 @@ static void fbcon_prepare_logo(struct vc_data *vc, struct fb_info *info, if (logo_lines > vc->vc_bottom) { logo_shown = FBCON_LOGO_CANSHOW; - printk(KERN_INFO - "fbcon_init: disable boot-logo (boot-logo bigger than screen).\n"); + pr_info("fbcon: disable boot-logo (boot-logo bigger than screen).\n"); } else { logo_shown = FBCON_LOGO_DRAW; vc->vc_top = logo_lines; -- GitLab From 2b44760609e9eaafc9d234a6883d042fc21132a7 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Mon, 22 Jan 2024 16:09:28 +0100 Subject: [PATCH 1280/1290] tracing: Ensure visibility when inserting an element into tracing_map Running the following two commands in parallel on a multi-processor AArch64 machine can sporadically produce an unexpected warning about duplicate histogram entries: $ while true; do echo hist:key=id.syscall:val=hitcount > \ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist sleep 0.001 done $ stress-ng --sysbadaddr $(nproc) The warning looks as follows: [ 2911.172474] ------------[ cut here ]------------ [ 2911.173111] Duplicates detected: 1 [ 2911.173574] WARNING: CPU: 2 PID: 12247 at kernel/trace/tracing_map.c:983 tracing_map_sort_entries+0x3e0/0x408 [ 2911.174702] Modules linked in: iscsi_ibft(E) iscsi_boot_sysfs(E) rfkill(E) af_packet(E) nls_iso8859_1(E) nls_cp437(E) vfat(E) fat(E) ena(E) tiny_power_button(E) qemu_fw_cfg(E) button(E) fuse(E) efi_pstore(E) ip_tables(E) x_tables(E) xfs(E) libcrc32c(E) aes_ce_blk(E) aes_ce_cipher(E) crct10dif_ce(E) polyval_ce(E) polyval_generic(E) ghash_ce(E) gf128mul(E) sm4_ce_gcm(E) sm4_ce_ccm(E) sm4_ce(E) sm4_ce_cipher(E) sm4(E) sm3_ce(E) sm3(E) sha3_ce(E) sha512_ce(E) sha512_arm64(E) sha2_ce(E) sha256_arm64(E) nvme(E) sha1_ce(E) nvme_core(E) nvme_auth(E) t10_pi(E) sg(E) scsi_mod(E) scsi_common(E) efivarfs(E) [ 2911.174738] Unloaded tainted modules: cppc_cpufreq(E):1 [ 2911.180985] CPU: 2 PID: 12247 Comm: cat Kdump: loaded Tainted: G E 6.7.0-default #2 1b58bbb22c97e4399dc09f92d309344f69c44a01 [ 2911.182398] Hardware name: Amazon EC2 c7g.8xlarge/, BIOS 1.0 11/1/2018 [ 2911.183208] pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) [ 2911.184038] pc : tracing_map_sort_entries+0x3e0/0x408 [ 2911.184667] lr : tracing_map_sort_entries+0x3e0/0x408 [ 2911.185310] sp : ffff8000a1513900 [ 2911.185750] x29: ffff8000a1513900 x28: ffff0003f272fe80 x27: 0000000000000001 [ 2911.186600] x26: ffff0003f272fe80 x25: 0000000000000030 x24: 0000000000000008 [ 2911.187458] x23: ffff0003c5788000 x22: ffff0003c16710c8 x21: ffff80008017f180 [ 2911.188310] x20: ffff80008017f000 x19: ffff80008017f180 x18: ffffffffffffffff [ 2911.189160] x17: 0000000000000000 x16: 0000000000000000 x15: ffff8000a15134b8 [ 2911.190015] x14: 0000000000000000 x13: 205d373432323154 x12: 5b5d313131333731 [ 2911.190844] x11: 00000000fffeffff x10: 00000000fffeffff x9 : ffffd1b78274a13c [ 2911.191716] x8 : 000000000017ffe8 x7 : c0000000fffeffff x6 : 000000000057ffa8 [ 2911.192554] x5 : ffff0012f6c24ec0 x4 : 0000000000000000 x3 : ffff2e5b72b5d000 [ 2911.193404] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0003ff254480 [ 2911.194259] Call trace: [ 2911.194626] tracing_map_sort_entries+0x3e0/0x408 [ 2911.195220] hist_show+0x124/0x800 [ 2911.195692] seq_read_iter+0x1d4/0x4e8 [ 2911.196193] seq_read+0xe8/0x138 [ 2911.196638] vfs_read+0xc8/0x300 [ 2911.197078] ksys_read+0x70/0x108 [ 2911.197534] __arm64_sys_read+0x24/0x38 [ 2911.198046] invoke_syscall+0x78/0x108 [ 2911.198553] el0_svc_common.constprop.0+0xd0/0xf8 [ 2911.199157] do_el0_svc+0x28/0x40 [ 2911.199613] el0_svc+0x40/0x178 [ 2911.200048] el0t_64_sync_handler+0x13c/0x158 [ 2911.200621] el0t_64_sync+0x1a8/0x1b0 [ 2911.201115] ---[ end trace 0000000000000000 ]--- The problem appears to be caused by CPU reordering of writes issued from __tracing_map_insert(). The check for the presence of an element with a given key in this function is: val = READ_ONCE(entry->val); if (val && keys_match(key, val->key, map->key_size)) ... The write of a new entry is: elt = get_free_elt(map); memcpy(elt->key, key, map->key_size); entry->val = elt; The "memcpy(elt->key, key, map->key_size);" and "entry->val = elt;" stores may become visible in the reversed order on another CPU. This second CPU might then incorrectly determine that a new key doesn't match an already present val->key and subsequently insert a new element, resulting in a duplicate. Fix the problem by adding a write barrier between "memcpy(elt->key, key, map->key_size);" and "entry->val = elt;", and for good measure, also use WRITE_ONCE(entry->val, elt) for publishing the element. The sequence pairs with the mentioned "READ_ONCE(entry->val);" and the "val->key" check which has an address dependency. The barrier is placed on a path executed when adding an element for a new key. Subsequent updates targeting the same key remain unaffected. From the user's perspective, the issue was introduced by commit c193707dde77 ("tracing: Remove code which merges duplicates"), which followed commit cbf4100efb8f ("tracing: Add support to detect and avoid duplicates"). The previous code operated differently; it inherently expected potential races which result in duplicates but merged them later when they occurred. Link: https://lore.kernel.org/linux-trace-kernel/20240122150928.27725-1-petr.pavlu@suse.com Fixes: c193707dde77 ("tracing: Remove code which merges duplicates") Signed-off-by: Petr Pavlu Acked-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/tracing_map.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index c774e560f2f95..a4dcf0f243521 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -574,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) } memcpy(elt->key, key, map->key_size); - entry->val = elt; + /* + * Ensure the initialization is visible and + * publish the elt. + */ + smp_wmb(); + WRITE_ONCE(entry->val, elt); atomic64_inc(&map->hits); return entry->val; -- GitLab From e01a83e12604aa2f8d4ab359ec44e341a2248b4a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 22 Jan 2024 15:39:01 -0800 Subject: [PATCH 1281/1290] Revert "btrfs: zstd: fix and simplify the inline extent decompression" This reverts commit 1e7f6def8b2370ecefb54b3c8f390ff894b0c51b. It causes my machine to not even boot, and Klara Modin reports that the cause is that small zstd-compressed files return garbage when read. Reported-by: Klara Modin Link: https://lore.kernel.org/linux-btrfs/CABq1_vj4GpUeZpVG49OHCo-3sdbe2-2ROcu_xDvUG-6-5zPRXg@mail.gmail.com/ Reported-and-bisected-by: Linus Torvalds Acked-by: David Sterba Cc: Qu Wenruo Signed-off-by: Linus Torvalds --- fs/btrfs/compression.h | 2 +- fs/btrfs/zstd.c | 75 +++++++++++++++++++++++++++++------------- 2 files changed, 54 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 97fe3ebf11a22..afd7e50d073d4 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -169,7 +169,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 346c46d88d07f..0d66db8bc1d47 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -20,7 +20,6 @@ #include "misc.h" #include "compression.h" #include "ctree.h" -#include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) @@ -619,48 +618,80 @@ done: } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); - const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; - unsigned long to_copy = 0; + size_t ret2; + unsigned long total_out = 0; + unsigned long pg_offset = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (!stream) { pr_warn("BTRFS: zstd_init_dstream failed\n"); + ret = -EIO; goto finish; } + destlen = min_t(size_t, destlen, PAGE_SIZE); + workspace->in_buf.src = data_in; workspace->in_buf.pos = 0; workspace->in_buf.size = srclen; workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = sectorsize; - - /* - * Since both input and output buffers should not exceed one sector, - * one call should end the decompression. - */ - ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); - if (zstd_is_error(ret)) { - pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n", - zstd_get_error_code(ret)); - goto finish; + workspace->out_buf.size = PAGE_SIZE; + + ret2 = 1; + while (pg_offset < destlen + && workspace->in_buf.pos < workspace->in_buf.size) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + + /* Check if the frame is over and we still need more input */ + if (ret2 == 0) { + pr_debug("BTRFS: zstd_decompress_stream ended early\n"); + ret = -EIO; + goto finish; + } + ret2 = zstd_decompress_stream(stream, &workspace->out_buf, + &workspace->in_buf); + if (zstd_is_error(ret2)) { + pr_debug("BTRFS: zstd_decompress_stream returned %d\n", + zstd_get_error_code(ret2)); + ret = -EIO; + goto finish; + } + + buf_start = total_out; + total_out += workspace->out_buf.pos; + workspace->out_buf.pos = 0; + + if (total_out <= start_byte) + continue; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min_t(unsigned long, destlen - pg_offset, + workspace->out_buf.size - buf_offset); + + memcpy_to_page(dest_page, pg_offset, + workspace->out_buf.dst + buf_offset, bytes); + + pg_offset += bytes; } - to_copy = workspace->out_buf.pos; - memcpy_to_page(dest_page, dest_pgoff + to_copy, workspace->out_buf.dst, to_copy); + ret = 0; finish: - /* Error or early end. */ - if (unlikely(to_copy < destlen)) { - ret = -EIO; - memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); + if (pg_offset < destlen) { + memzero_page(dest_page, pg_offset, destlen - pg_offset); } return ret; } -- GitLab From 7ed2632ec7d72e926b9e8bcc9ad1bb0cd37274bf Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sun, 14 Jan 2024 00:33:45 +0300 Subject: [PATCH 1282/1290] drm/ttm: fix ttm pool initialization for no-dma-device drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The QXL driver doesn't use any device for DMA mappings or allocations so dev_to_node() will panic inside ttm_device_init() on NUMA systems: general protection fault, probably for non-canonical address 0xdffffc000000007a: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x00000000000003d0-0x00000000000003d7] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.7.0+ #9 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 RIP: 0010:ttm_device_init+0x10e/0x340 Call Trace: qxl_ttm_init+0xaa/0x310 qxl_device_init+0x1071/0x2000 qxl_pci_probe+0x167/0x3f0 local_pci_probe+0xe1/0x1b0 pci_device_probe+0x29d/0x790 really_probe+0x251/0x910 __driver_probe_device+0x1ea/0x390 driver_probe_device+0x4e/0x2e0 __driver_attach+0x1e3/0x600 bus_for_each_dev+0x12d/0x1c0 bus_add_driver+0x25a/0x590 driver_register+0x15c/0x4b0 qxl_pci_driver_init+0x67/0x80 do_one_initcall+0xf5/0x5d0 kernel_init_freeable+0x637/0xb10 kernel_init+0x1c/0x2e0 ret_from_fork+0x48/0x80 ret_from_fork_asm+0x1b/0x30 RIP: 0010:ttm_device_init+0x10e/0x340 Fall back to NUMA_NO_NODE if there is no device for DMA. Found by Linux Verification Center (linuxtesting.org). Fixes: b0a7ce53d494 ("drm/ttm: Schedule delayed_delete worker closer") Signed-off-by: Fedor Pchelkin Reviewed-by: Christian König Reported-by: Steven Rostedt Cc: Rajneesh Bhardwaj Cc: Felix Kuehling Signed-off-by: Linus Torvalds --- drivers/gpu/drm/ttm/ttm_device.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c index f5187b384ae9a..4130945052ed2 100644 --- a/drivers/gpu/drm/ttm/ttm_device.c +++ b/drivers/gpu/drm/ttm/ttm_device.c @@ -195,7 +195,7 @@ int ttm_device_init(struct ttm_device *bdev, const struct ttm_device_funcs *func bool use_dma_alloc, bool use_dma32) { struct ttm_global *glob = &ttm_glob; - int ret; + int ret, nid; if (WARN_ON(vma_manager == NULL)) return -EINVAL; @@ -215,7 +215,12 @@ int ttm_device_init(struct ttm_device *bdev, const struct ttm_device_funcs *func ttm_sys_man_init(bdev); - ttm_pool_init(&bdev->pool, dev, dev_to_node(dev), use_dma_alloc, use_dma32); + if (dev) + nid = dev_to_node(dev); + else + nid = NUMA_NO_NODE; + + ttm_pool_init(&bdev->pool, dev, nid, use_dma_alloc, use_dma32); bdev->vma_manager = vma_manager; spin_lock_init(&bdev->lru_lock); -- GitLab From 4b088005c897a62fe98f70ab69687706cb2fad3b Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 22 Jan 2024 21:26:33 +0100 Subject: [PATCH 1283/1290] fbdev: stifb: Fix crash in stifb_blank() Avoid a kernel crash in stifb by providing the correct pointer to the fb_info struct. Prior to commit e2e0b838a184 ("video/sticore: Remove info field from STI struct") the fb_info struct was at the beginning of the fb struct. Fixes: e2e0b838a184 ("video/sticore: Remove info field from STI struct") Signed-off-by: Helge Deller Cc: Thomas Zimmermann --- drivers/video/fbdev/stifb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/stifb.c b/drivers/video/fbdev/stifb.c index 2de0e675fd150..8e5bac27542d9 100644 --- a/drivers/video/fbdev/stifb.c +++ b/drivers/video/fbdev/stifb.c @@ -1158,7 +1158,7 @@ stifb_init_display(struct stifb_info *fb) } break; } - stifb_blank(0, (struct fb_info *)fb); /* 0=enable screen */ + stifb_blank(0, fb->info); /* 0=enable screen */ SETUP_FB(fb); } -- GitLab From 834bf76add3e6168038150f162cbccf1fd492a67 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 22 Jan 2024 15:27:48 -0500 Subject: [PATCH 1284/1290] eventfs: Save directory inodes in the eventfs_inode structure The eventfs inodes and directories are allocated when referenced. But this leaves the issue of keeping consistent inode numbers and the number is only saved in the inode structure itself. When the inode is no longer referenced, it can be freed. When the file that the inode was representing is referenced again, the inode is once again created, but the inode number needs to be the same as it was before. Just making the inode numbers the same for all files is fine, but that does not work with directories. The find command will check for loops via the inode number and having the same inode number for directories triggers: # find /sys/kernel/tracing find: File system loop detected; '/sys/kernel/debug/tracing/events/initcall/initcall_finish' is part of the same file system loop as '/sys/kernel/debug/tracing/events/initcall'. [..] Linus pointed out that the eventfs_inode structure ends with a single 32bit int, and on 64 bit machines, there's likely a 4 byte hole due to alignment. We can use this hole to store the inode number for the eventfs_inode. All directories in eventfs are represented by an eventfs_inode and that data structure can hold its inode number. That last int was also purposely placed at the end of the structure to prevent holes from within. Now that there's a 4 byte number to hold the inode, both the inode number and the last integer can be moved up in the structure for better cache locality, where the llist and rcu fields can be moved to the end as they are only used when the eventfs_inode is being deleted. Link: https://lore.kernel.org/all/CAMuHMdXKiorg-jiuKoZpfZyDJ3Ynrfb8=X+c7x0Eewxn-YRdCA@mail.gmail.com/ Link: https://lore.kernel.org/linux-trace-kernel/20240122152748.46897388@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Linus Torvalds Reported-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Fixes: 53c41052ba31 ("eventfs: Have the inodes all for files and directories all be the same") Signed-off-by: Steven Rostedt (Google) Reviewed-by: Kees Cook --- fs/tracefs/event_inode.c | 14 +++++++++++--- fs/tracefs/internal.h | 7 ++++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 6795fda2af191..6b211522a13ec 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -34,7 +34,15 @@ static DEFINE_MUTEX(eventfs_mutex); /* Choose something "unique" ;-) */ #define EVENTFS_FILE_INODE_INO 0x12c4e37 -#define EVENTFS_DIR_INODE_INO 0x134b2f5 + +/* Just try to make something consistent and unique */ +static int eventfs_dir_ino(struct eventfs_inode *ei) +{ + if (!ei->ino) + ei->ino = get_next_ino(); + + return ei->ino; +} /* * The eventfs_inode (ei) itself is protected by SRCU. It is released from @@ -396,7 +404,7 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent inode->i_fop = &eventfs_file_operations; /* All directories will have the same inode number */ - inode->i_ino = EVENTFS_DIR_INODE_INO; + inode->i_ino = eventfs_dir_ino(ei); ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; @@ -802,7 +810,7 @@ static int eventfs_iterate(struct file *file, struct dir_context *ctx) name = ei_child->name; - ino = EVENTFS_DIR_INODE_INO; + ino = eventfs_dir_ino(ei_child); if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR)) goto out_dec; diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 12b7d0150ae9e..45397df9bb65b 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -55,6 +55,10 @@ struct eventfs_inode { struct eventfs_attr *entry_attrs; struct eventfs_attr attr; void *data; + unsigned int is_freed:1; + unsigned int is_events:1; + unsigned int nr_entries:30; + unsigned int ino; /* * Union - used for deletion * @llist: for calling dput() if needed after RCU @@ -64,9 +68,6 @@ struct eventfs_inode { struct llist_node llist; struct rcu_head rcu; }; - unsigned int is_freed:1; - unsigned int is_events:1; - unsigned int nr_entries:30; }; static inline struct tracefs_inode *get_tracefs(const struct inode *inode) -- GitLab From e787644caf7628ad3269c1fbd321c3255cf51710 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Dec 2023 00:19:15 +0100 Subject: [PATCH 1285/1290] rcu: Defer RCU kthreads wakeup when CPU is dying When the CPU goes idle for the last time during the CPU down hotplug process, RCU reports a final quiescent state for the current CPU. If this quiescent state propagates up to the top, some tasks may then be woken up to complete the grace period: the main grace period kthread and/or the expedited main workqueue (or kworker). If those kthreads have a SCHED_FIFO policy, the wake up can indirectly arm the RT bandwith timer to the local offline CPU. Since this happens after hrtimers have been migrated at CPUHP_AP_HRTIMERS_DYING stage, the timer gets ignored. Therefore if the RCU kthreads are waiting for RT bandwidth to be available, they may never be actually scheduled. This triggers TREE03 rcutorture hangs: rcu: INFO: rcu_preempt self-detected stall on CPU rcu: 4-...!: (1 GPs behind) idle=9874/1/0x4000000000000000 softirq=0/0 fqs=20 rcuc=21071 jiffies(starved) rcu: (t=21035 jiffies g=938281 q=40787 ncpus=6) rcu: rcu_preempt kthread starved for 20964 jiffies! g938281 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x0 ->cpu=0 rcu: Unless rcu_preempt kthread gets sufficient CPU time, OOM is now expected behavior. rcu: RCU grace-period kthread stack dump: task:rcu_preempt state:R running task stack:14896 pid:14 tgid:14 ppid:2 flags:0x00004000 Call Trace: __schedule+0x2eb/0xa80 schedule+0x1f/0x90 schedule_timeout+0x163/0x270 ? __pfx_process_timeout+0x10/0x10 rcu_gp_fqs_loop+0x37c/0x5b0 ? __pfx_rcu_gp_kthread+0x10/0x10 rcu_gp_kthread+0x17c/0x200 kthread+0xde/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2b/0x40 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 The situation can't be solved with just unpinning the timer. The hrtimer infrastructure and the nohz heuristics involved in finding the best remote target for an unpinned timer would then also need to handle enqueues from an offline CPU in the most horrendous way. So fix this on the RCU side instead and defer the wake up to an online CPU if it's too late for the local one. Reported-by: Paul E. McKenney Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 34 +++++++++++++++++++++++++++++++++- kernel/rcu/tree_exp.h | 3 +-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1ae8517778066..b2bccfd37c383 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1013,6 +1013,38 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) return needmore; } +static void swake_up_one_online_ipi(void *arg) +{ + struct swait_queue_head *wqh = arg; + + swake_up_one(wqh); +} + +static void swake_up_one_online(struct swait_queue_head *wqh) +{ + int cpu = get_cpu(); + + /* + * If called from rcutree_report_cpu_starting(), wake up + * is dangerous that late in the CPU-down hotplug process. The + * scheduler might queue an ignored hrtimer. Defer the wake up + * to an online CPU instead. + */ + if (unlikely(cpu_is_offline(cpu))) { + int target; + + target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU), + cpu_online_mask); + + smp_call_function_single(target, swake_up_one_online_ipi, + wqh, 0); + put_cpu(); + } else { + put_cpu(); + swake_up_one(wqh); + } +} + /* * Awaken the grace-period kthread. Don't do a self-awaken (unless in an * interrupt or softirq handler, in which case we just might immediately @@ -1037,7 +1069,7 @@ static void rcu_gp_kthread_wake(void) return; WRITE_ONCE(rcu_state.gp_wake_time, jiffies); WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); - swake_up_one(&rcu_state.gp_wq); + swake_up_one_online(&rcu_state.gp_wq); } /* diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6d7cea5d591f9..2ac440bc7e10b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -173,7 +173,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) return ret; } - /* * Report the exit from RCU read-side critical section for the last task * that queued itself during or before the current expedited preemptible-RCU @@ -201,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - swake_up_one(&rcu_state.expedited_wq); + swake_up_one_online(&rcu_state.expedited_wq); } break; } -- GitLab From 4759ff71f23e1a9cba001009abab68cde6dc327a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 24 Jan 2024 11:22:32 -0800 Subject: [PATCH 1286/1290] exec: Check __FMODE_EXEC instead of in_execve for LSMs After commit 978ffcbf00d8 ("execve: open the executable file before doing anything else"), current->in_execve was no longer in sync with the open(). This broke AppArmor and TOMOYO which depend on this flag to distinguish "open" operations from being "exec" operations. Instead of moving around in_execve, switch to using __FMODE_EXEC, which is where the "is this an exec?" intent is stored. Note that TOMOYO still uses in_execve around cred handling. Reported-by: Kevin Locke Closes: https://lore.kernel.org/all/ZbE4qn9_h14OqADK@kevinlocke.name Suggested-by: Linus Torvalds Fixes: 978ffcbf00d8 ("execve: open the executable file before doing anything else") Cc: Josh Triplett Cc: John Johansen Cc: Paul Moore Cc: James Morris Cc: Serge E. Hallyn Cc: Kentaro Takeda Cc: Tetsuo Handa Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Eric Biederman Cc: Andrew Morton Cc: Sebastian Andrzej Siewior Cc: Cc: Cc: Cc: Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- security/apparmor/lsm.c | 4 +++- security/tomoyo/tomoyo.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 7717354ce0950..98e1150bee9d0 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -469,8 +469,10 @@ static int apparmor_file_open(struct file *file) * Cache permissions granted by the previous exec check, with * implicit read and executable mmap which are required to * actually execute the image. + * + * Illogically, FMODE_EXEC is in f_flags, not f_mode. */ - if (current->in_execve) { + if (file->f_flags & __FMODE_EXEC) { fctx->allow = MAY_EXEC | MAY_READ | AA_EXEC_MMAP; return 0; } diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 3c3af149bf1c1..04a92c3d65d44 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -328,7 +328,8 @@ static int tomoyo_file_fcntl(struct file *file, unsigned int cmd, static int tomoyo_file_open(struct file *f) { /* Don't check read permission here if called from execve(). */ - if (current->in_execve) + /* Illogically, FMODE_EXEC is in f_flags, not f_mode. */ + if (f->f_flags & __FMODE_EXEC) return 0; return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path, f->f_flags); -- GitLab From 90383cc07895183c75a0db2460301c2ffd912359 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 24 Jan 2024 11:15:33 -0800 Subject: [PATCH 1287/1290] exec: Distinguish in_execve from in_exec Just to help distinguish the fs->in_exec flag from the current->in_execve flag, add comments in check_unsafe_exec() and copy_fs() for more context. Also note that in_execve is only used by TOMOYO now. Cc: Kentaro Takeda Cc: Tetsuo Handa Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Eric Biederman Cc: Andrew Morton Cc: Sebastian Andrzej Siewior Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Signed-off-by: Kees Cook --- fs/exec.c | 1 + include/linux/sched.h | 2 +- kernel/fork.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 39d773021fffd..d179abb78a1c8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1633,6 +1633,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) } rcu_read_unlock(); + /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else diff --git a/include/linux/sched.h b/include/linux/sched.h index cdb8ea53c365b..ffe8f618ab869 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -920,7 +920,7 @@ struct task_struct { unsigned sched_rt_mutex:1; #endif - /* Bit to tell LSMs we're in execve(): */ + /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK diff --git a/kernel/fork.c b/kernel/fork.c index 47ff3b35352e0..0d944e92a43ff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1748,6 +1748,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ spin_lock(&fs->lock); + /* "users" and "in_exec" locked for check_unsafe_exec() */ if (fs->in_exec) { spin_unlock(&fs->lock); return -EAGAIN; -- GitLab From 443b349019f2d9461b23213a4308f9cf72e41c5e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 24 Jan 2024 11:52:40 -0800 Subject: [PATCH 1288/1290] samples/cgroup: add .gitignore file for generated samples Make 'git status' quietly happy again after a full allmodconfig build. Fixes: 60433a9d038d ("samples: introduce new samples subdir for cgroup") Signed-off-by: Linus Torvalds --- samples/cgroup/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 samples/cgroup/.gitignore diff --git a/samples/cgroup/.gitignore b/samples/cgroup/.gitignore new file mode 100644 index 0000000000000..3a0161194ccec --- /dev/null +++ b/samples/cgroup/.gitignore @@ -0,0 +1,3 @@ +/cgroup_event_listener +/memcg_event_listener + -- GitLab From 1ed4b563100230ea68821a2b25a3d9f25388a3e6 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 24 Jan 2024 14:21:44 -0500 Subject: [PATCH 1289/1290] Revert "KEYS: encrypted: Add check for strsep" This reverts commit b4af096b5df5dd131ab796c79cedc7069d8f4882. New encrypted keys are created either from kernel-generated random numbers or user-provided decrypted data. Revert the change requiring user-provided decrypted data. Reported-by: Vishal Verma Signed-off-by: Mimi Zohar --- security/keys/encrypted-keys/encrypted.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 76f55dd13cb80..8af2136069d23 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -237,10 +237,6 @@ static int datablob_parse(char *datablob, const char **format, break; } *decrypted_data = strsep(&datablob, " \t"); - if (!*decrypted_data) { - pr_info("encrypted_key: decrypted_data is missing\n"); - break; - } ret = 0; break; case Opt_load: -- GitLab From 3eab830189d94f0f80f34cbff609b5bb54002679 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 24 Jan 2024 13:12:20 -0800 Subject: [PATCH 1290/1290] uselib: remove use of __FMODE_EXEC Jann Horn points out that uselib() really shouldn't trigger the new FMODE_EXEC logic introduced by commit 4759ff71f23e ("exec: __FMODE_EXEC instead of in_execve for LSMs"). In fact, it shouldn't even have ever triggered the old pre-existing logic for __FMODE_EXEC (like the NFS code that makes executables not need read permissions). Unlike a real execve(), that can work even with files that are purely executable by the user (not readable), uselib() has that MAY_READ requirement becasue it's really just a convenience wrapper around mmap() for legacy shared libraries. The whole FMODE_EXEC bit was originally introduced by commit b500531e6f5f ("[PATCH] Introduce FMODE_EXEC file flag"), primarily to give ETXTBUSY error returns for distributed filesystems. It has since grown a few other warts (like that NFS thing), but there really isn't any reason to use it for uselib(), and now that we are trying to use it to replace the horrid 'tsk->in_execve' flag, it's actively wrong. Of course, as Jann Horn also points out, nobody should be enabling CONFIG_USELIB in the first place in this day and age, but that's a different discussion entirely. Reported-by: Jann Horn Fixes: 4759ff71f23e ("exec: __FMODE_EXEC instead of in_execve for LSMs") Cc: Kees Cook Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 8cdd5b2dd09c2..1a097c1c2f774 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -128,7 +128,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { - .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .open_flag = O_LARGEFILE | O_RDONLY, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, -- GitLab